From 3493e261d7ed9bb6dfadaa62e8c9c04006181275 Mon Sep 17 00:00:00 2001 From: "weiye.wang" Date: Fri, 3 Mar 2023 21:32:35 +0800 Subject: [PATCH] 20230303 evening --- 工具/新题比对.ipynb | 192 +++++----- 工具/相似题目检测.ipynb | 763 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 851 insertions(+), 104 deletions(-) diff --git a/工具/新题比对.ipynb b/工具/新题比对.ipynb index 9472afca..94b58277 100644 --- a/工具/新题比对.ipynb +++ b/工具/新题比对.ipynb @@ -2,88 +2,109 @@ "cells": [ { "cell_type": "code", - "execution_count": 22, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1.000\t1\t021365\n", - "1.000\t2\t021366\n", - "1.000\t3\t021367\n", - "1.000\t4\t021368\n", - "0.944\t5\t021369\n", - "0.947\t6\t021370\n", - "0.917\t7\t021372\n", - "1.000\t8\t021371\n", - "1.000\t9\t021373\n", - "1.000\t10\t021374\n", - "1.000\t11\t021375\n", - "1.000\t12\t021376\n", - "0.966\t13\t021377\n", - "1.000\t14\t022022\n", - "0.902\t15\t021379\n", - "0.865\t16\t021382\n", - "1.000\t17\t022023\n", - "0.987\t18\t021383\n", - "1.000\t19\t021384\n", - "1.000\t20\t021385\n", - "1.000\t21\t021386\n", - "1.000\t22\t021387\n", - "1.000\t23\t021389\n", - "1.000\t24\t022024\n", - "1.000\t25\t021390\n", - "1.000\t26\t022026\n", - "0.891\t27\t021392\n", - "0.965\t28\t021393\n", - "0.986\t29\t021394\n", - "0.940\t30\t021395\n", - "1.000\t31\t021396\n", - "1.000\t32\t022027\n", - "1.000\t33\t021397\n", - "1.000\t34\t022028\n", - "0.805\t35\t021401\n", - "1.000\t36\t021403\n", - "1.000\t37\t022029\n", - "1.000\t38\t022030\n", - "1.000\t39\t022031\n", - "1.000\t40\t022032\n", - "1.000\t41\t022033\n", - "1.000\t42\t022034\n", - "0.887\t43\t021410\n", - "1.000\t44\t022035\n", - "1.000\t45\t022036\n", - "1.000\t46\t022037\n", - "1.000\t47\t021413\n", - "0.959\t48\t022038\n", - "1.000\t49\t022039\n", - "1.000\t50\t021415\n", - "1.000\t51\t022040\n", - "1.000\t52\t022041\n", - "0.793\t53\t021418\n", - "0.807\t54\t021420\n", - "0.693\t55\t021421\n", - "1.000\t56\t021422\n", - "1.000\t57\t021423\n", - "1.000\t58\t022042\n", - "1.000\t59\t022043\n", - "0.805\t60\t021427\n", - "0.957\t61\t021425\n", - "0.770\t62\t021428\n", - "0.970\t63\t022044\n", - "1.000\t64\t022045\n", - "0.738\t65\t021430\n", - "1.000\t66\t022046\n", - "1.000\t67\t022047\n", - "0.792\t68\t021432\n", - "0.793\t69\t021434\n", - "0.721\t70\t021433\n", - "0.811\t71\t021435\n", - "0.728\t72\t021436\n", - "1.000\t73\t021437\n", - "0.989\t74\t021438\n", - "0.848\t75\t021440\n" + "1.000\t1\t004884\n", + "1.000\t2\t003673\n", + "0.903\t3\t000506\n", + "0.984\t4\t003665\n", + "0.921\t5\t012805\n", + "0.949\t6\t011578\n", + "0.987\t7\t011716\n", + "0.987\t8\t011674\n", + "0.867\t9\t000041\n", + "0.939\t10\t002911\n", + "0.876\t11\t011630\n", + "0.785\t12\t002858\n", + "0.990\t13\t011636\n", + "0.848\t14\t011186\n", + "0.763\t15\t011186\n", + "0.662\t16\t003625\n", + "0.971\t17\t011687\n", + "0.910\t18\t011712\n", + "0.685\t19\t003857\n", + "0.887\t20\t012107\n", + "1.000\t21\t011594\n", + "0.803\t22\t040098\n", + "1.000\t23\t011708\n", + "1.000\t24\t011724\n", + "1.000\t25\t011639\n", + "0.819\t26\t012756\n", + "1.000\t27\t011670\n", + "1.000\t28\t011608\n", + "0.995\t29\t011728\n", + "0.780\t30\t001553\n", + "0.876\t31\t009993\n", + "0.948\t32\t003638\n", + "0.628\t33\t003607\n", + "0.702\t34\t012316\n", + "0.686\t35\t022043\n", + "0.671\t36\t021435\n", + "0.957\t37\t011611\n", + "0.551\t38\t040015\n", + "0.935\t39\t012448\n", + "1.000\t40\t011648\n", + "1.000\t41\t011671\n", + "0.665\t42\t000387\n", + "0.777\t43\t003624\n", + "0.993\t44\t003666\n", + "1.000\t45\t012195\n", + "0.942\t46\t009988\n", + "1.000\t47\t011696\n", + "1.000\t48\t011631\n", + "0.602\t49\t004391\n", + "0.879\t50\t010710\n", + "1.000\t51\t011721\n", + "0.903\t52\t013057\n", + "0.799\t53\t009074\n", + "0.984\t54\t011686\n", + "0.975\t55\t011718\n", + "0.993\t56\t003733\n", + "1.000\t57\t000629\n", + "1.000\t58\t011697\n", + "0.998\t59\t011736\n", + "0.994\t60\t011700\n", + "1.000\t61\t003674\n", + "0.632\t62\t007439\n", + "0.986\t63\t012746\n", + "0.850\t64\t000512\n", + "0.684\t65\t012743\n", + "0.590\t66\t009751\n", + "0.800\t67\t010005\n", + "0.730\t68\t013272\n", + "0.975\t69\t004037\n", + "0.614\t70\t010551\n", + "0.867\t71\t013396\n", + "0.875\t72\t012745\n", + "0.994\t73\t012100\n", + "0.680\t74\t012289\n", + "1.000\t75\t021151\n", + "0.557\t76\t013684\n", + "0.527\t77\t031184\n", + "0.683\t78\t000659\n", + "0.795\t79\t012359\n", + "0.698\t80\t000283\n", + "0.761\t81\t009445\n", + "0.849\t82\t008965\n", + "0.618\t83\t005293\n", + "0.589\t84\t005293\n", + "0.785\t85\t020601\n", + "0.613\t86\t031069\n", + "0.775\t87\t009921\n", + "0.700\t88\t001769\n", + "0.636\t89\t010481\n", + "0.703\t90\t012295\n", + "0.734\t91\t011946\n", + "0.670\t92\t012825\n", + "0.728\t93\t020841\n", + "0.686\t94\t009040\n", + "0.549\t95\t009718\n", + "0.554\t96\t004998\n" ] } ], @@ -91,11 +112,11 @@ "import os,re,difflib,Levenshtein,time,json\n", "\n", "# 重要!!! 范围\n", - "old_problems_range = \"21365:21440,22022:22047\"\n", + "old_problems_range = \"1:999999\"\n", "threshold = 0.85\n", "\n", "# 待比对的文件\n", - "filename = r\"D:\\temp\\derivatives.tex\"\n", + "filename = r\"C:\\Users\\weiye\\Documents\\wwy sync\\临时工作区\\空中课堂第六批.tex\"\n", "\n", "#生成数码列表, 逗号分隔每个区块, 区块内部用:表示整数闭区间\n", "def generate_number_set(string):\n", @@ -168,7 +189,8 @@ "new_pro_list = GenerateProblemListFromString(newdatabase)\n", "\n", "pro_dict_treated = {}\n", - "idrange = generate_number_set(old_problems_range)\n", + "idrange_raw = generate_number_set(old_problems_range)\n", + "idrange = [id for id in pro_dict if id in idrange_raw]\n", "for p in idrange:\n", " pro_dict_treated[p] = pre_treating(pro_dict[p][\"content\"])\n", "\n", @@ -193,22 +215,22 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "75" + "999999" ] }, - "execution_count": 16, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(new_dict_treated)" + "len(idrange)" ] }, { @@ -235,12 +257,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.15" + "version": "3.9.15" }, "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "42dd566da87765ddbe9b5c5b483063747fec4aacc5469ad554706e4b742e67b2" + "hash": "ff3c292c316ba85de6f1ad75f19c731e79d694e741b6f515ec18f14996fe48dc" } } }, diff --git a/工具/相似题目检测.ipynb b/工具/相似题目检测.ipynb index 21a48a43..a3de6d22 100644 --- a/工具/相似题目检测.ipynb +++ b/工具/相似题目检测.ipynb @@ -2,24 +2,741 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "旧题目数: 12858 , 新题目数: 191\n", + "旧题目数: 0 , 新题目数: 18100\n", "开始新题与旧题的比对\n", "50\n", "100\n", "150\n", + "200\n", + "250\n", + "300\n", + "350\n", + "400\n", + "450\n", + "500\n", + "550\n", + "600\n", + "650\n", + "700\n", + "750\n", + "800\n", + "850\n", + "900\n", + "950\n", + "1000\n", + "1050\n", + "1100\n", + "1150\n", + "1200\n", + "1250\n", + "1300\n", + "1350\n", + "1400\n", + "1450\n", + "1500\n", + "1550\n", + "1600\n", + "1650\n", + "1700\n", + "1750\n", + "1800\n", + "1850\n", + "1900\n", + "1950\n", + "2000\n", + "2050\n", + "2100\n", + "2150\n", + "2200\n", + "2250\n", + "2300\n", + "2350\n", + "2400\n", + "2450\n", + "2500\n", + "2550\n", + "2600\n", + "2650\n", + "2700\n", + "2750\n", + "2800\n", + "2850\n", + "2900\n", + "2950\n", + "3000\n", + "3050\n", + "3100\n", + "3150\n", + "3200\n", + "3250\n", + "3300\n", + "3350\n", + "3400\n", + "3450\n", + "3500\n", + "3550\n", + "3600\n", + "3650\n", + "3700\n", + "3750\n", + "3800\n", + "3850\n", + "3900\n", + "3950\n", + "4000\n", + "4050\n", + "4100\n", + "4150\n", + "4200\n", + "4250\n", + "4300\n", + "4350\n", + "4400\n", + "4450\n", + "4500\n", + "4550\n", + "4600\n", + "4650\n", + "4700\n", + "4750\n", + "4800\n", + "4850\n", + "4900\n", + "4950\n", + "5000\n", + "5050\n", + "5100\n", + "5150\n", + "5200\n", + "5250\n", + "5300\n", + "5350\n", + "5400\n", + "5450\n", + "5500\n", + "5550\n", + "5600\n", + "5650\n", + "5700\n", + "5750\n", + "5800\n", + "5850\n", + "5900\n", + "5950\n", + "6000\n", + "6050\n", + "6100\n", + "6150\n", + "6200\n", + "6250\n", + "6300\n", + "6350\n", + "6400\n", + "6450\n", + "6500\n", + "6550\n", + "6600\n", + "6650\n", + "6700\n", + "6750\n", + "6800\n", + "6850\n", + "6900\n", + "6950\n", + "7000\n", + "7050\n", + "7100\n", + "7150\n", + "7200\n", + "7250\n", + "7300\n", + "7350\n", + "7400\n", + "7450\n", + "7500\n", + "7550\n", + "7600\n", + "7650\n", + "7700\n", + "7750\n", + "7800\n", + "7850\n", + "7900\n", + "7950\n", + "8000\n", + "8050\n", + "8100\n", + "8150\n", + "8200\n", + "8250\n", + "8300\n", + "8350\n", + "8400\n", + "8450\n", + "8500\n", + "8550\n", + "8600\n", + "8650\n", + "8700\n", + "8750\n", + "8800\n", + "8850\n", + "8900\n", + "8950\n", + "9000\n", + "9050\n", + "9100\n", + "9150\n", + "9200\n", + "9250\n", + "9300\n", + "9350\n", + "9400\n", + "9450\n", + "9500\n", + "9550\n", + "9600\n", + "9650\n", + "9700\n", + "9750\n", + "9800\n", + "9850\n", + "9900\n", + "9950\n", + "10000\n", + "10050\n", + "10100\n", + "10150\n", + "10200\n", + "10250\n", + "10300\n", + "10350\n", + "10400\n", + "10450\n", + "10500\n", + "10550\n", + "10600\n", + "10650\n", + "10700\n", + "10750\n", + "10800\n", + "10850\n", + "10900\n", + "10950\n", + "11000\n", + "11050\n", + "11100\n", + "11150\n", + "11200\n", + "11250\n", + "11300\n", + "11350\n", + "11400\n", + "11450\n", + "11500\n", + "11550\n", + "11600\n", + "11650\n", + "11700\n", + "11750\n", + "11800\n", + "11850\n", + "11900\n", + "11950\n", + "12000\n", + "12050\n", + "12100\n", + "12150\n", + "12200\n", + "12250\n", + "12300\n", + "12350\n", + "12400\n", + "12450\n", + "12500\n", + "12550\n", + "12600\n", + "12650\n", + "12700\n", + "12750\n", + "12800\n", + "12850\n", + "12900\n", + "12950\n", + "13000\n", + "13050\n", + "13100\n", + "13150\n", + "13200\n", + "13250\n", + "13300\n", + "13350\n", + "13400\n", + "13450\n", + "13500\n", + "13550\n", + "13600\n", + "13650\n", + "13700\n", + "13750\n", + "13800\n", + "13850\n", + "13900\n", + "13950\n", + "14000\n", + "14050\n", + "14100\n", + "14150\n", + "14200\n", + "14250\n", + "14300\n", + "14350\n", + "14400\n", + "14450\n", + "14500\n", + "14550\n", + "14600\n", + "14650\n", + "14700\n", + "14750\n", + "14800\n", + "14850\n", + "14900\n", + "14950\n", + "15000\n", + "15050\n", + "15100\n", + "15150\n", + "15200\n", + "15250\n", + "15300\n", + "15350\n", + "15400\n", + "15450\n", + "15500\n", + "15550\n", + "15600\n", + "15650\n", + "15700\n", + "15750\n", + "15800\n", + "15850\n", + "15900\n", + "15950\n", + "16000\n", + "16050\n", + "16100\n", + "16150\n", + "16200\n", + "16250\n", + "16300\n", + "16350\n", + "16400\n", + "16450\n", + "16500\n", + "16550\n", + "16600\n", + "16650\n", + "16700\n", + "16750\n", + "16800\n", + "16850\n", + "16900\n", + "16950\n", + "17000\n", + "17050\n", + "17100\n", + "17150\n", + "17200\n", + "17250\n", + "17300\n", + "17350\n", + "17400\n", + "17450\n", + "17500\n", + "17550\n", + "17600\n", + "17650\n", + "17700\n", + "17750\n", + "17800\n", + "17850\n", + "17900\n", + "17950\n", + "18000\n", + "18050\n", + "18100\n", "开始新题之间的比对\n", "50\n", "100\n", "150\n", - "总耗时: 79.84281706809998 秒.\n", - "发现相似: 252 , 其中已标注: 0 .\n" + "200\n", + "250\n", + "300\n", + "350\n", + "400\n", + "450\n", + "500\n", + "550\n", + "600\n", + "650\n", + "700\n", + "750\n", + "800\n", + "850\n", + "900\n", + "950\n", + "1000\n", + "1050\n", + "1100\n", + "1150\n", + "1200\n", + "1250\n", + "1300\n", + "1350\n", + "1400\n", + "1450\n", + "1500\n", + "1550\n", + "1600\n", + "1650\n", + "1700\n", + "1750\n", + "1800\n", + "1850\n", + "1900\n", + "1950\n", + "2000\n", + "2050\n", + "2100\n", + "2150\n", + "2200\n", + "2250\n", + "2300\n", + "2350\n", + "2400\n", + "2450\n", + "2500\n", + "2550\n", + "2600\n", + "2650\n", + "2700\n", + "2750\n", + "2800\n", + "2850\n", + "2900\n", + "2950\n", + "3000\n", + "3050\n", + "3100\n", + "3150\n", + "3200\n", + "3250\n", + "3300\n", + "3350\n", + "3400\n", + "3450\n", + "3500\n", + "3550\n", + "3600\n", + "3650\n", + "3700\n", + "3750\n", + "3800\n", + "3850\n", + "3900\n", + "3950\n", + "4000\n", + "4050\n", + "4100\n", + "4150\n", + "4200\n", + "4250\n", + "4300\n", + "4350\n", + "4400\n", + "4450\n", + "4500\n", + "4550\n", + "4600\n", + "4650\n", + "4700\n", + "4750\n", + "4800\n", + "4850\n", + "4900\n", + "4950\n", + "5000\n", + "5050\n", + "5100\n", + "5150\n", + "5200\n", + "5250\n", + "5300\n", + "5350\n", + "5400\n", + "5450\n", + "5500\n", + "5550\n", + "5600\n", + "5650\n", + "5700\n", + "5750\n", + "5800\n", + "5850\n", + "5900\n", + "5950\n", + "6000\n", + "6050\n", + "6100\n", + "6150\n", + "6200\n", + "6250\n", + "6300\n", + "6350\n", + "6400\n", + "6450\n", + "6500\n", + "6550\n", + "6600\n", + "6650\n", + "6700\n", + "6750\n", + "6800\n", + "6850\n", + "6900\n", + "6950\n", + "7000\n", + "7050\n", + "7100\n", + "7150\n", + "7200\n", + "7250\n", + "7300\n", + "7350\n", + "7400\n", + "7450\n", + "7500\n", + "7550\n", + "7600\n", + "7650\n", + "7700\n", + "7750\n", + "7800\n", + "7850\n", + "7900\n", + "7950\n", + "8000\n", + "8050\n", + "8100\n", + "8150\n", + "8200\n", + "8250\n", + "8300\n", + "8350\n", + "8400\n", + "8450\n", + "8500\n", + "8550\n", + "8600\n", + "8650\n", + "8700\n", + "8750\n", + "8800\n", + "8850\n", + "8900\n", + "8950\n", + "9000\n", + "9050\n", + "9100\n", + "9150\n", + "9200\n", + "9250\n", + "9300\n", + "9350\n", + "9400\n", + "9450\n", + "9500\n", + "9550\n", + "9600\n", + "9650\n", + "9700\n", + "9750\n", + "9800\n", + "9850\n", + "9900\n", + "9950\n", + "10000\n", + "10050\n", + "10100\n", + "10150\n", + "10200\n", + "10250\n", + "10300\n", + "10350\n", + "10400\n", + "10450\n", + "10500\n", + "10550\n", + "10600\n", + "10650\n", + "10700\n", + "10750\n", + "10800\n", + "10850\n", + "10900\n", + "10950\n", + "11000\n", + "11050\n", + "11100\n", + "11150\n", + "11200\n", + "11250\n", + "11300\n", + "11350\n", + "11400\n", + "11450\n", + "11500\n", + "11550\n", + "11600\n", + "11650\n", + "11700\n", + "11750\n", + "11800\n", + "11850\n", + "11900\n", + "11950\n", + "12000\n", + "12050\n", + "12100\n", + "12150\n", + "12200\n", + "12250\n", + "12300\n", + "12350\n", + "12400\n", + "12450\n", + "12500\n", + "12550\n", + "12600\n", + "12650\n", + "12700\n", + "12750\n", + "12800\n", + "12850\n", + "12900\n", + "12950\n", + "13000\n", + "13050\n", + "13100\n", + "13150\n", + "13200\n", + "13250\n", + "13300\n", + "13350\n", + "13400\n", + "13450\n", + "13500\n", + "13550\n", + "13600\n", + "13650\n", + "13700\n", + "13750\n", + "13800\n", + "13850\n", + "13900\n", + "13950\n", + "14000\n", + "14050\n", + "14100\n", + "14150\n", + "14200\n", + "14250\n", + "14300\n", + "14350\n", + "14400\n", + "14450\n", + "14500\n", + "14550\n", + "14600\n", + "14650\n", + "14700\n", + "14750\n", + "14800\n", + "14850\n", + "14900\n", + "14950\n", + "15000\n", + "15050\n", + "15100\n", + "15150\n", + "15200\n", + "15250\n", + "15300\n", + "15350\n", + "15400\n", + "15450\n", + "15500\n", + "15550\n", + "15600\n", + "15650\n", + "15700\n", + "15750\n", + "15800\n", + "15850\n", + "15900\n", + "15950\n", + "16000\n", + "16050\n", + "16100\n", + "16150\n", + "16200\n", + "16250\n", + "16300\n", + "16350\n", + "16400\n", + "16450\n", + "16500\n", + "16550\n", + "16600\n", + "16650\n", + "16700\n", + "16750\n", + "16800\n", + "16850\n", + "16900\n", + "16950\n", + "17000\n", + "17050\n", + "17100\n", + "17150\n", + "17200\n", + "17250\n", + "17300\n", + "17350\n", + "17400\n", + "17450\n", + "17500\n", + "17550\n", + "17600\n", + "17650\n", + "17700\n", + "17750\n", + "17800\n", + "17850\n", + "17900\n", + "17950\n", + "18000\n", + "18050\n", + "总耗时: 384.6457269191742 秒.\n", + "发现相似: 2800 , 其中已标注: 2223 .\n" ] } ], @@ -28,8 +745,8 @@ "import os,re,difflib,Levenshtein,time,json\n", "\n", "# 重要!!! 新题目的范围\n", - "id_new_problems = \"12138:12328\"\n", - "threshold = 0.85\n", + "id_new_problems = \"1:50000\"\n", + "threshold = 0.99\n", "\n", "#生成数码列表, 逗号分隔每个区块, 区块内部用:表示整数闭区间\n", "def generate_number_set(string):\n", @@ -57,20 +774,20 @@ "\n", "#difflab字符串比较\n", "def difflab_get_equal_rate(str1, str2):\n", - " str1 = pre_treating(str1)\n", - " str2 = pre_treating(str2)\n", + " # str1 = pre_treating(str1)\n", + " # str2 = pre_treating(str2)\n", " return difflib.SequenceMatcher(None, str1, str2).ratio()\n", "\n", "#Levenshtein jaro字符串比较\n", "def jaro_get_equal_rate(str1,str2):\n", - " str1 = pre_treating(str1)\n", - " str2 = pre_treating(str2)\n", + " # str1 = pre_treating(str1)\n", + " # str2 = pre_treating(str2)\n", " return Levenshtein.jaro(str1,str2)\n", "\n", "#Levenshtein 字符串比较\n", "def Lev_get_equal_rate(str1,str2):\n", - " str1 = pre_treating(str1)\n", - " str2 = pre_treating(str2)\n", + " # str1 = pre_treating(str1)\n", + " # str2 = pre_treating(str2)\n", " return Levenshtein.ratio(str1,str2)\n", "\n", "\n", @@ -85,14 +802,19 @@ "pro_dict = json.loads(database)\n", "\n", "#生成旧题目数据库字典与新题目数据库字典\n", - "new_id_list = generate_number_set(id_new_problems)\n", + "new_id_list_raw = generate_number_set(id_new_problems)\n", + "new_id_list = [id for id in pro_dict if id in new_id_list_raw]\n", "old_problems_dict = {}\n", "new_problems_dict = {}\n", + "old_problems_dict_content = {}\n", + "new_problems_dict_content = {}\n", "for id in pro_dict:\n", " if id in new_id_list:\n", " new_problems_dict[id] = pro_dict[id]\n", + " new_problems_dict_content[id] = pre_treating(pro_dict[id][\"content\"])\n", " else:\n", " old_problems_dict[id] = pro_dict[id]\n", + " old_problems_dict_content[id] = pre_treating(pro_dict[id][\"content\"])\n", "print(\"旧题目数:\",len(old_problems_dict),\", 新题目数:\",len(new_problems_dict))\n", "\n", "#记录起始时间\n", @@ -102,6 +824,8 @@ "\n", "alike_problems = \"\"\n", "\n", + "\n", + "\n", "#开始新题与旧题的比对\n", "count = 0\n", "print(\"开始新题与旧题的比对\")\n", @@ -110,7 +834,7 @@ " if count % 50 == 0:\n", " print(count)\n", " for id_old in old_problems_dict:\n", - " similar_rate = sim_test(new_problems_dict[id_new][\"content\"],old_problems_dict[id_old][\"content\"])\n", + " similar_rate = sim_test(new_problems_dict_content[id_new],old_problems_dict_content[id_old])\n", " if similar_rate > threshold or id_new in old_problems_dict[id_old][\"related\"] or id_new in old_problems_dict[id_old][\"same\"] or id_old in new_problems_dict[id_new][\"related\"] or id_old in new_problems_dict[id_new][\"same\"]:\n", " suspect_count += 1\n", " if not (id_new in old_problems_dict[id_old][\"related\"] or id_new in old_problems_dict[id_old][\"same\"] or id_old in new_problems_dict[id_new][\"related\"] or id_old in new_problems_dict[id_new][\"same\"]):\n", @@ -127,8 +851,9 @@ " print(count)\n", " keys = list(new_problems_dict.keys())\n", " current_problem = new_problems_dict.pop(keys[0])\n", + " current_problem_content = new_problems_dict_content[current_problem[\"id\"]]\n", " for id_new in new_problems_dict:\n", - " similar_rate = sim_test(new_problems_dict[id_new][\"content\"],current_problem[\"content\"])\n", + " similar_rate = sim_test(new_problems_dict_content[id_new],current_problem_content)\n", " if similar_rate > threshold or id_new in current_problem[\"related\"] or id_new in current_problem[\"same\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"related\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"same\"]:\n", " suspect_count += 1\n", " if not (id_new in current_problem[\"related\"] or id_new in current_problem[\"same\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"related\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"same\"]):\n", @@ -143,7 +868,7 @@ "print(\"发现相似: \",suspect_count,\", 其中已标注: \",remarked,\".\")\n", "\n", "with open(\"临时文件/相似题目.txt\",\"w\",encoding=\"utf8\") as f:\n", - " f.write(alike_problems)\n" + " f.write(alike_problems)" ] }, { @@ -156,7 +881,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.9.13 ('base')", + "display_name": "mathdept", "language": "python", "name": "python3" }, @@ -170,12 +895,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]" + "version": "3.9.15" }, "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186" + "hash": "ff3c292c316ba85de6f1ad75f19c731e79d694e741b6f515ec18f14996fe48dc" } } },