20230303 evening

This commit is contained in:
weiye.wang 2023-03-03 21:32:35 +08:00
parent 0ca515887b
commit 3493e261d7
2 changed files with 851 additions and 104 deletions

View File

@ -2,88 +2,109 @@
"cells": [
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.000\t1\t021365\n",
"1.000\t2\t021366\n",
"1.000\t3\t021367\n",
"1.000\t4\t021368\n",
"0.944\t5\t021369\n",
"0.947\t6\t021370\n",
"0.917\t7\t021372\n",
"1.000\t8\t021371\n",
"1.000\t9\t021373\n",
"1.000\t10\t021374\n",
"1.000\t11\t021375\n",
"1.000\t12\t021376\n",
"0.966\t13\t021377\n",
"1.000\t14\t022022\n",
"0.902\t15\t021379\n",
"0.865\t16\t021382\n",
"1.000\t17\t022023\n",
"0.987\t18\t021383\n",
"1.000\t19\t021384\n",
"1.000\t20\t021385\n",
"1.000\t21\t021386\n",
"1.000\t22\t021387\n",
"1.000\t23\t021389\n",
"1.000\t24\t022024\n",
"1.000\t25\t021390\n",
"1.000\t26\t022026\n",
"0.891\t27\t021392\n",
"0.965\t28\t021393\n",
"0.986\t29\t021394\n",
"0.940\t30\t021395\n",
"1.000\t31\t021396\n",
"1.000\t32\t022027\n",
"1.000\t33\t021397\n",
"1.000\t34\t022028\n",
"0.805\t35\t021401\n",
"1.000\t36\t021403\n",
"1.000\t37\t022029\n",
"1.000\t38\t022030\n",
"1.000\t39\t022031\n",
"1.000\t40\t022032\n",
"1.000\t41\t022033\n",
"1.000\t42\t022034\n",
"0.887\t43\t021410\n",
"1.000\t44\t022035\n",
"1.000\t45\t022036\n",
"1.000\t46\t022037\n",
"1.000\t47\t021413\n",
"0.959\t48\t022038\n",
"1.000\t49\t022039\n",
"1.000\t50\t021415\n",
"1.000\t51\t022040\n",
"1.000\t52\t022041\n",
"0.793\t53\t021418\n",
"0.807\t54\t021420\n",
"0.693\t55\t021421\n",
"1.000\t56\t021422\n",
"1.000\t57\t021423\n",
"1.000\t58\t022042\n",
"1.000\t59\t022043\n",
"0.805\t60\t021427\n",
"0.957\t61\t021425\n",
"0.770\t62\t021428\n",
"0.970\t63\t022044\n",
"1.000\t64\t022045\n",
"0.738\t65\t021430\n",
"1.000\t66\t022046\n",
"1.000\t67\t022047\n",
"0.792\t68\t021432\n",
"0.793\t69\t021434\n",
"0.721\t70\t021433\n",
"0.811\t71\t021435\n",
"0.728\t72\t021436\n",
"1.000\t73\t021437\n",
"0.989\t74\t021438\n",
"0.848\t75\t021440\n"
"1.000\t1\t004884\n",
"1.000\t2\t003673\n",
"0.903\t3\t000506\n",
"0.984\t4\t003665\n",
"0.921\t5\t012805\n",
"0.949\t6\t011578\n",
"0.987\t7\t011716\n",
"0.987\t8\t011674\n",
"0.867\t9\t000041\n",
"0.939\t10\t002911\n",
"0.876\t11\t011630\n",
"0.785\t12\t002858\n",
"0.990\t13\t011636\n",
"0.848\t14\t011186\n",
"0.763\t15\t011186\n",
"0.662\t16\t003625\n",
"0.971\t17\t011687\n",
"0.910\t18\t011712\n",
"0.685\t19\t003857\n",
"0.887\t20\t012107\n",
"1.000\t21\t011594\n",
"0.803\t22\t040098\n",
"1.000\t23\t011708\n",
"1.000\t24\t011724\n",
"1.000\t25\t011639\n",
"0.819\t26\t012756\n",
"1.000\t27\t011670\n",
"1.000\t28\t011608\n",
"0.995\t29\t011728\n",
"0.780\t30\t001553\n",
"0.876\t31\t009993\n",
"0.948\t32\t003638\n",
"0.628\t33\t003607\n",
"0.702\t34\t012316\n",
"0.686\t35\t022043\n",
"0.671\t36\t021435\n",
"0.957\t37\t011611\n",
"0.551\t38\t040015\n",
"0.935\t39\t012448\n",
"1.000\t40\t011648\n",
"1.000\t41\t011671\n",
"0.665\t42\t000387\n",
"0.777\t43\t003624\n",
"0.993\t44\t003666\n",
"1.000\t45\t012195\n",
"0.942\t46\t009988\n",
"1.000\t47\t011696\n",
"1.000\t48\t011631\n",
"0.602\t49\t004391\n",
"0.879\t50\t010710\n",
"1.000\t51\t011721\n",
"0.903\t52\t013057\n",
"0.799\t53\t009074\n",
"0.984\t54\t011686\n",
"0.975\t55\t011718\n",
"0.993\t56\t003733\n",
"1.000\t57\t000629\n",
"1.000\t58\t011697\n",
"0.998\t59\t011736\n",
"0.994\t60\t011700\n",
"1.000\t61\t003674\n",
"0.632\t62\t007439\n",
"0.986\t63\t012746\n",
"0.850\t64\t000512\n",
"0.684\t65\t012743\n",
"0.590\t66\t009751\n",
"0.800\t67\t010005\n",
"0.730\t68\t013272\n",
"0.975\t69\t004037\n",
"0.614\t70\t010551\n",
"0.867\t71\t013396\n",
"0.875\t72\t012745\n",
"0.994\t73\t012100\n",
"0.680\t74\t012289\n",
"1.000\t75\t021151\n",
"0.557\t76\t013684\n",
"0.527\t77\t031184\n",
"0.683\t78\t000659\n",
"0.795\t79\t012359\n",
"0.698\t80\t000283\n",
"0.761\t81\t009445\n",
"0.849\t82\t008965\n",
"0.618\t83\t005293\n",
"0.589\t84\t005293\n",
"0.785\t85\t020601\n",
"0.613\t86\t031069\n",
"0.775\t87\t009921\n",
"0.700\t88\t001769\n",
"0.636\t89\t010481\n",
"0.703\t90\t012295\n",
"0.734\t91\t011946\n",
"0.670\t92\t012825\n",
"0.728\t93\t020841\n",
"0.686\t94\t009040\n",
"0.549\t95\t009718\n",
"0.554\t96\t004998\n"
]
}
],
@ -91,11 +112,11 @@
"import os,re,difflib,Levenshtein,time,json\n",
"\n",
"# 重要!!! 范围\n",
"old_problems_range = \"21365:21440,22022:22047\"\n",
"old_problems_range = \"1:999999\"\n",
"threshold = 0.85\n",
"\n",
"# 待比对的文件\n",
"filename = r\"D:\\temp\\derivatives.tex\"\n",
"filename = r\"C:\\Users\\weiye\\Documents\\wwy sync\\临时工作区\\空中课堂第六批.tex\"\n",
"\n",
"#生成数码列表, 逗号分隔每个区块, 区块内部用:表示整数闭区间\n",
"def generate_number_set(string):\n",
@ -168,7 +189,8 @@
"new_pro_list = GenerateProblemListFromString(newdatabase)\n",
"\n",
"pro_dict_treated = {}\n",
"idrange = generate_number_set(old_problems_range)\n",
"idrange_raw = generate_number_set(old_problems_range)\n",
"idrange = [id for id in pro_dict if id in idrange_raw]\n",
"for p in idrange:\n",
" pro_dict_treated[p] = pre_treating(pro_dict[p][\"content\"])\n",
"\n",
@ -193,22 +215,22 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"75"
"999999"
]
},
"execution_count": 16,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(new_dict_treated)"
"len(idrange)"
]
},
{
@ -235,12 +257,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
"version": "3.9.15"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "42dd566da87765ddbe9b5c5b483063747fec4aacc5469ad554706e4b742e67b2"
"hash": "ff3c292c316ba85de6f1ad75f19c731e79d694e741b6f515ec18f14996fe48dc"
}
}
},

View File

@ -2,24 +2,741 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"旧题目数: 12858 , 新题目数: 191\n",
"旧题目数: 0 , 新题目数: 18100\n",
"开始新题与旧题的比对\n",
"50\n",
"100\n",
"150\n",
"200\n",
"250\n",
"300\n",
"350\n",
"400\n",
"450\n",
"500\n",
"550\n",
"600\n",
"650\n",
"700\n",
"750\n",
"800\n",
"850\n",
"900\n",
"950\n",
"1000\n",
"1050\n",
"1100\n",
"1150\n",
"1200\n",
"1250\n",
"1300\n",
"1350\n",
"1400\n",
"1450\n",
"1500\n",
"1550\n",
"1600\n",
"1650\n",
"1700\n",
"1750\n",
"1800\n",
"1850\n",
"1900\n",
"1950\n",
"2000\n",
"2050\n",
"2100\n",
"2150\n",
"2200\n",
"2250\n",
"2300\n",
"2350\n",
"2400\n",
"2450\n",
"2500\n",
"2550\n",
"2600\n",
"2650\n",
"2700\n",
"2750\n",
"2800\n",
"2850\n",
"2900\n",
"2950\n",
"3000\n",
"3050\n",
"3100\n",
"3150\n",
"3200\n",
"3250\n",
"3300\n",
"3350\n",
"3400\n",
"3450\n",
"3500\n",
"3550\n",
"3600\n",
"3650\n",
"3700\n",
"3750\n",
"3800\n",
"3850\n",
"3900\n",
"3950\n",
"4000\n",
"4050\n",
"4100\n",
"4150\n",
"4200\n",
"4250\n",
"4300\n",
"4350\n",
"4400\n",
"4450\n",
"4500\n",
"4550\n",
"4600\n",
"4650\n",
"4700\n",
"4750\n",
"4800\n",
"4850\n",
"4900\n",
"4950\n",
"5000\n",
"5050\n",
"5100\n",
"5150\n",
"5200\n",
"5250\n",
"5300\n",
"5350\n",
"5400\n",
"5450\n",
"5500\n",
"5550\n",
"5600\n",
"5650\n",
"5700\n",
"5750\n",
"5800\n",
"5850\n",
"5900\n",
"5950\n",
"6000\n",
"6050\n",
"6100\n",
"6150\n",
"6200\n",
"6250\n",
"6300\n",
"6350\n",
"6400\n",
"6450\n",
"6500\n",
"6550\n",
"6600\n",
"6650\n",
"6700\n",
"6750\n",
"6800\n",
"6850\n",
"6900\n",
"6950\n",
"7000\n",
"7050\n",
"7100\n",
"7150\n",
"7200\n",
"7250\n",
"7300\n",
"7350\n",
"7400\n",
"7450\n",
"7500\n",
"7550\n",
"7600\n",
"7650\n",
"7700\n",
"7750\n",
"7800\n",
"7850\n",
"7900\n",
"7950\n",
"8000\n",
"8050\n",
"8100\n",
"8150\n",
"8200\n",
"8250\n",
"8300\n",
"8350\n",
"8400\n",
"8450\n",
"8500\n",
"8550\n",
"8600\n",
"8650\n",
"8700\n",
"8750\n",
"8800\n",
"8850\n",
"8900\n",
"8950\n",
"9000\n",
"9050\n",
"9100\n",
"9150\n",
"9200\n",
"9250\n",
"9300\n",
"9350\n",
"9400\n",
"9450\n",
"9500\n",
"9550\n",
"9600\n",
"9650\n",
"9700\n",
"9750\n",
"9800\n",
"9850\n",
"9900\n",
"9950\n",
"10000\n",
"10050\n",
"10100\n",
"10150\n",
"10200\n",
"10250\n",
"10300\n",
"10350\n",
"10400\n",
"10450\n",
"10500\n",
"10550\n",
"10600\n",
"10650\n",
"10700\n",
"10750\n",
"10800\n",
"10850\n",
"10900\n",
"10950\n",
"11000\n",
"11050\n",
"11100\n",
"11150\n",
"11200\n",
"11250\n",
"11300\n",
"11350\n",
"11400\n",
"11450\n",
"11500\n",
"11550\n",
"11600\n",
"11650\n",
"11700\n",
"11750\n",
"11800\n",
"11850\n",
"11900\n",
"11950\n",
"12000\n",
"12050\n",
"12100\n",
"12150\n",
"12200\n",
"12250\n",
"12300\n",
"12350\n",
"12400\n",
"12450\n",
"12500\n",
"12550\n",
"12600\n",
"12650\n",
"12700\n",
"12750\n",
"12800\n",
"12850\n",
"12900\n",
"12950\n",
"13000\n",
"13050\n",
"13100\n",
"13150\n",
"13200\n",
"13250\n",
"13300\n",
"13350\n",
"13400\n",
"13450\n",
"13500\n",
"13550\n",
"13600\n",
"13650\n",
"13700\n",
"13750\n",
"13800\n",
"13850\n",
"13900\n",
"13950\n",
"14000\n",
"14050\n",
"14100\n",
"14150\n",
"14200\n",
"14250\n",
"14300\n",
"14350\n",
"14400\n",
"14450\n",
"14500\n",
"14550\n",
"14600\n",
"14650\n",
"14700\n",
"14750\n",
"14800\n",
"14850\n",
"14900\n",
"14950\n",
"15000\n",
"15050\n",
"15100\n",
"15150\n",
"15200\n",
"15250\n",
"15300\n",
"15350\n",
"15400\n",
"15450\n",
"15500\n",
"15550\n",
"15600\n",
"15650\n",
"15700\n",
"15750\n",
"15800\n",
"15850\n",
"15900\n",
"15950\n",
"16000\n",
"16050\n",
"16100\n",
"16150\n",
"16200\n",
"16250\n",
"16300\n",
"16350\n",
"16400\n",
"16450\n",
"16500\n",
"16550\n",
"16600\n",
"16650\n",
"16700\n",
"16750\n",
"16800\n",
"16850\n",
"16900\n",
"16950\n",
"17000\n",
"17050\n",
"17100\n",
"17150\n",
"17200\n",
"17250\n",
"17300\n",
"17350\n",
"17400\n",
"17450\n",
"17500\n",
"17550\n",
"17600\n",
"17650\n",
"17700\n",
"17750\n",
"17800\n",
"17850\n",
"17900\n",
"17950\n",
"18000\n",
"18050\n",
"18100\n",
"开始新题之间的比对\n",
"50\n",
"100\n",
"150\n",
"总耗时: 79.84281706809998 秒.\n",
"发现相似: 252 , 其中已标注: 0 .\n"
"200\n",
"250\n",
"300\n",
"350\n",
"400\n",
"450\n",
"500\n",
"550\n",
"600\n",
"650\n",
"700\n",
"750\n",
"800\n",
"850\n",
"900\n",
"950\n",
"1000\n",
"1050\n",
"1100\n",
"1150\n",
"1200\n",
"1250\n",
"1300\n",
"1350\n",
"1400\n",
"1450\n",
"1500\n",
"1550\n",
"1600\n",
"1650\n",
"1700\n",
"1750\n",
"1800\n",
"1850\n",
"1900\n",
"1950\n",
"2000\n",
"2050\n",
"2100\n",
"2150\n",
"2200\n",
"2250\n",
"2300\n",
"2350\n",
"2400\n",
"2450\n",
"2500\n",
"2550\n",
"2600\n",
"2650\n",
"2700\n",
"2750\n",
"2800\n",
"2850\n",
"2900\n",
"2950\n",
"3000\n",
"3050\n",
"3100\n",
"3150\n",
"3200\n",
"3250\n",
"3300\n",
"3350\n",
"3400\n",
"3450\n",
"3500\n",
"3550\n",
"3600\n",
"3650\n",
"3700\n",
"3750\n",
"3800\n",
"3850\n",
"3900\n",
"3950\n",
"4000\n",
"4050\n",
"4100\n",
"4150\n",
"4200\n",
"4250\n",
"4300\n",
"4350\n",
"4400\n",
"4450\n",
"4500\n",
"4550\n",
"4600\n",
"4650\n",
"4700\n",
"4750\n",
"4800\n",
"4850\n",
"4900\n",
"4950\n",
"5000\n",
"5050\n",
"5100\n",
"5150\n",
"5200\n",
"5250\n",
"5300\n",
"5350\n",
"5400\n",
"5450\n",
"5500\n",
"5550\n",
"5600\n",
"5650\n",
"5700\n",
"5750\n",
"5800\n",
"5850\n",
"5900\n",
"5950\n",
"6000\n",
"6050\n",
"6100\n",
"6150\n",
"6200\n",
"6250\n",
"6300\n",
"6350\n",
"6400\n",
"6450\n",
"6500\n",
"6550\n",
"6600\n",
"6650\n",
"6700\n",
"6750\n",
"6800\n",
"6850\n",
"6900\n",
"6950\n",
"7000\n",
"7050\n",
"7100\n",
"7150\n",
"7200\n",
"7250\n",
"7300\n",
"7350\n",
"7400\n",
"7450\n",
"7500\n",
"7550\n",
"7600\n",
"7650\n",
"7700\n",
"7750\n",
"7800\n",
"7850\n",
"7900\n",
"7950\n",
"8000\n",
"8050\n",
"8100\n",
"8150\n",
"8200\n",
"8250\n",
"8300\n",
"8350\n",
"8400\n",
"8450\n",
"8500\n",
"8550\n",
"8600\n",
"8650\n",
"8700\n",
"8750\n",
"8800\n",
"8850\n",
"8900\n",
"8950\n",
"9000\n",
"9050\n",
"9100\n",
"9150\n",
"9200\n",
"9250\n",
"9300\n",
"9350\n",
"9400\n",
"9450\n",
"9500\n",
"9550\n",
"9600\n",
"9650\n",
"9700\n",
"9750\n",
"9800\n",
"9850\n",
"9900\n",
"9950\n",
"10000\n",
"10050\n",
"10100\n",
"10150\n",
"10200\n",
"10250\n",
"10300\n",
"10350\n",
"10400\n",
"10450\n",
"10500\n",
"10550\n",
"10600\n",
"10650\n",
"10700\n",
"10750\n",
"10800\n",
"10850\n",
"10900\n",
"10950\n",
"11000\n",
"11050\n",
"11100\n",
"11150\n",
"11200\n",
"11250\n",
"11300\n",
"11350\n",
"11400\n",
"11450\n",
"11500\n",
"11550\n",
"11600\n",
"11650\n",
"11700\n",
"11750\n",
"11800\n",
"11850\n",
"11900\n",
"11950\n",
"12000\n",
"12050\n",
"12100\n",
"12150\n",
"12200\n",
"12250\n",
"12300\n",
"12350\n",
"12400\n",
"12450\n",
"12500\n",
"12550\n",
"12600\n",
"12650\n",
"12700\n",
"12750\n",
"12800\n",
"12850\n",
"12900\n",
"12950\n",
"13000\n",
"13050\n",
"13100\n",
"13150\n",
"13200\n",
"13250\n",
"13300\n",
"13350\n",
"13400\n",
"13450\n",
"13500\n",
"13550\n",
"13600\n",
"13650\n",
"13700\n",
"13750\n",
"13800\n",
"13850\n",
"13900\n",
"13950\n",
"14000\n",
"14050\n",
"14100\n",
"14150\n",
"14200\n",
"14250\n",
"14300\n",
"14350\n",
"14400\n",
"14450\n",
"14500\n",
"14550\n",
"14600\n",
"14650\n",
"14700\n",
"14750\n",
"14800\n",
"14850\n",
"14900\n",
"14950\n",
"15000\n",
"15050\n",
"15100\n",
"15150\n",
"15200\n",
"15250\n",
"15300\n",
"15350\n",
"15400\n",
"15450\n",
"15500\n",
"15550\n",
"15600\n",
"15650\n",
"15700\n",
"15750\n",
"15800\n",
"15850\n",
"15900\n",
"15950\n",
"16000\n",
"16050\n",
"16100\n",
"16150\n",
"16200\n",
"16250\n",
"16300\n",
"16350\n",
"16400\n",
"16450\n",
"16500\n",
"16550\n",
"16600\n",
"16650\n",
"16700\n",
"16750\n",
"16800\n",
"16850\n",
"16900\n",
"16950\n",
"17000\n",
"17050\n",
"17100\n",
"17150\n",
"17200\n",
"17250\n",
"17300\n",
"17350\n",
"17400\n",
"17450\n",
"17500\n",
"17550\n",
"17600\n",
"17650\n",
"17700\n",
"17750\n",
"17800\n",
"17850\n",
"17900\n",
"17950\n",
"18000\n",
"18050\n",
"总耗时: 384.6457269191742 秒.\n",
"发现相似: 2800 , 其中已标注: 2223 .\n"
]
}
],
@ -28,8 +745,8 @@
"import os,re,difflib,Levenshtein,time,json\n",
"\n",
"# 重要!!! 新题目的范围\n",
"id_new_problems = \"12138:12328\"\n",
"threshold = 0.85\n",
"id_new_problems = \"1:50000\"\n",
"threshold = 0.99\n",
"\n",
"#生成数码列表, 逗号分隔每个区块, 区块内部用:表示整数闭区间\n",
"def generate_number_set(string):\n",
@ -57,20 +774,20 @@
"\n",
"#difflab字符串比较\n",
"def difflab_get_equal_rate(str1, str2):\n",
" str1 = pre_treating(str1)\n",
" str2 = pre_treating(str2)\n",
" # str1 = pre_treating(str1)\n",
" # str2 = pre_treating(str2)\n",
" return difflib.SequenceMatcher(None, str1, str2).ratio()\n",
"\n",
"#Levenshtein jaro字符串比较\n",
"def jaro_get_equal_rate(str1,str2):\n",
" str1 = pre_treating(str1)\n",
" str2 = pre_treating(str2)\n",
" # str1 = pre_treating(str1)\n",
" # str2 = pre_treating(str2)\n",
" return Levenshtein.jaro(str1,str2)\n",
"\n",
"#Levenshtein 字符串比较\n",
"def Lev_get_equal_rate(str1,str2):\n",
" str1 = pre_treating(str1)\n",
" str2 = pre_treating(str2)\n",
" # str1 = pre_treating(str1)\n",
" # str2 = pre_treating(str2)\n",
" return Levenshtein.ratio(str1,str2)\n",
"\n",
"\n",
@ -85,14 +802,19 @@
"pro_dict = json.loads(database)\n",
"\n",
"#生成旧题目数据库字典与新题目数据库字典\n",
"new_id_list = generate_number_set(id_new_problems)\n",
"new_id_list_raw = generate_number_set(id_new_problems)\n",
"new_id_list = [id for id in pro_dict if id in new_id_list_raw]\n",
"old_problems_dict = {}\n",
"new_problems_dict = {}\n",
"old_problems_dict_content = {}\n",
"new_problems_dict_content = {}\n",
"for id in pro_dict:\n",
" if id in new_id_list:\n",
" new_problems_dict[id] = pro_dict[id]\n",
" new_problems_dict_content[id] = pre_treating(pro_dict[id][\"content\"])\n",
" else:\n",
" old_problems_dict[id] = pro_dict[id]\n",
" old_problems_dict_content[id] = pre_treating(pro_dict[id][\"content\"])\n",
"print(\"旧题目数:\",len(old_problems_dict),\", 新题目数:\",len(new_problems_dict))\n",
"\n",
"#记录起始时间\n",
@ -102,6 +824,8 @@
"\n",
"alike_problems = \"\"\n",
"\n",
"\n",
"\n",
"#开始新题与旧题的比对\n",
"count = 0\n",
"print(\"开始新题与旧题的比对\")\n",
@ -110,7 +834,7 @@
" if count % 50 == 0:\n",
" print(count)\n",
" for id_old in old_problems_dict:\n",
" similar_rate = sim_test(new_problems_dict[id_new][\"content\"],old_problems_dict[id_old][\"content\"])\n",
" similar_rate = sim_test(new_problems_dict_content[id_new],old_problems_dict_content[id_old])\n",
" if similar_rate > threshold or id_new in old_problems_dict[id_old][\"related\"] or id_new in old_problems_dict[id_old][\"same\"] or id_old in new_problems_dict[id_new][\"related\"] or id_old in new_problems_dict[id_new][\"same\"]:\n",
" suspect_count += 1\n",
" if not (id_new in old_problems_dict[id_old][\"related\"] or id_new in old_problems_dict[id_old][\"same\"] or id_old in new_problems_dict[id_new][\"related\"] or id_old in new_problems_dict[id_new][\"same\"]):\n",
@ -127,8 +851,9 @@
" print(count)\n",
" keys = list(new_problems_dict.keys())\n",
" current_problem = new_problems_dict.pop(keys[0])\n",
" current_problem_content = new_problems_dict_content[current_problem[\"id\"]]\n",
" for id_new in new_problems_dict:\n",
" similar_rate = sim_test(new_problems_dict[id_new][\"content\"],current_problem[\"content\"])\n",
" similar_rate = sim_test(new_problems_dict_content[id_new],current_problem_content)\n",
" if similar_rate > threshold or id_new in current_problem[\"related\"] or id_new in current_problem[\"same\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"related\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"same\"]:\n",
" suspect_count += 1\n",
" if not (id_new in current_problem[\"related\"] or id_new in current_problem[\"same\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"related\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"same\"]):\n",
@ -143,7 +868,7 @@
"print(\"发现相似: \",suspect_count,\", 其中已标注: \",remarked,\".\")\n",
"\n",
"with open(\"临时文件/相似题目.txt\",\"w\",encoding=\"utf8\") as f:\n",
" f.write(alike_problems)\n"
" f.write(alike_problems)"
]
},
{
@ -156,7 +881,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.13 ('base')",
"display_name": "mathdept",
"language": "python",
"name": "python3"
},
@ -170,12 +895,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]"
"version": "3.9.15"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186"
"hash": "ff3c292c316ba85de6f1ad75f19c731e79d694e741b6f515ec18f14996fe48dc"
}
}
},