272 lines
8.2 KiB
Plaintext
272 lines
8.2 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1.000\t1\t004884\n",
|
|
"1.000\t2\t003673\n",
|
|
"0.903\t3\t000506\n",
|
|
"0.984\t4\t003665\n",
|
|
"0.921\t5\t012805\n",
|
|
"0.949\t6\t011578\n",
|
|
"0.987\t7\t011716\n",
|
|
"0.987\t8\t011674\n",
|
|
"0.867\t9\t000041\n",
|
|
"0.939\t10\t002911\n",
|
|
"0.876\t11\t011630\n",
|
|
"0.785\t12\t002858\n",
|
|
"0.990\t13\t011636\n",
|
|
"0.848\t14\t011186\n",
|
|
"0.763\t15\t011186\n",
|
|
"0.662\t16\t003625\n",
|
|
"0.971\t17\t011687\n",
|
|
"0.910\t18\t011712\n",
|
|
"0.685\t19\t003857\n",
|
|
"0.887\t20\t012107\n",
|
|
"1.000\t21\t011594\n",
|
|
"0.803\t22\t040098\n",
|
|
"1.000\t23\t011708\n",
|
|
"1.000\t24\t011724\n",
|
|
"1.000\t25\t011639\n",
|
|
"0.819\t26\t012756\n",
|
|
"1.000\t27\t011670\n",
|
|
"1.000\t28\t011608\n",
|
|
"0.995\t29\t011728\n",
|
|
"0.780\t30\t001553\n",
|
|
"0.876\t31\t009993\n",
|
|
"0.948\t32\t003638\n",
|
|
"0.628\t33\t003607\n",
|
|
"0.702\t34\t012316\n",
|
|
"0.686\t35\t022043\n",
|
|
"0.671\t36\t021435\n",
|
|
"0.957\t37\t011611\n",
|
|
"0.551\t38\t040015\n",
|
|
"0.935\t39\t012448\n",
|
|
"1.000\t40\t011648\n",
|
|
"1.000\t41\t011671\n",
|
|
"0.665\t42\t000387\n",
|
|
"0.777\t43\t003624\n",
|
|
"0.993\t44\t003666\n",
|
|
"1.000\t45\t012195\n",
|
|
"0.942\t46\t009988\n",
|
|
"1.000\t47\t011696\n",
|
|
"1.000\t48\t011631\n",
|
|
"0.602\t49\t004391\n",
|
|
"0.879\t50\t010710\n",
|
|
"1.000\t51\t011721\n",
|
|
"0.903\t52\t013057\n",
|
|
"0.799\t53\t009074\n",
|
|
"0.984\t54\t011686\n",
|
|
"0.975\t55\t011718\n",
|
|
"0.993\t56\t003733\n",
|
|
"1.000\t57\t000629\n",
|
|
"1.000\t58\t011697\n",
|
|
"0.998\t59\t011736\n",
|
|
"0.994\t60\t011700\n",
|
|
"1.000\t61\t003674\n",
|
|
"0.632\t62\t007439\n",
|
|
"0.986\t63\t012746\n",
|
|
"0.850\t64\t000512\n",
|
|
"0.684\t65\t012743\n",
|
|
"0.590\t66\t009751\n",
|
|
"0.800\t67\t010005\n",
|
|
"0.730\t68\t013272\n",
|
|
"0.975\t69\t004037\n",
|
|
"0.614\t70\t010551\n",
|
|
"0.867\t71\t013396\n",
|
|
"0.875\t72\t012745\n",
|
|
"0.994\t73\t012100\n",
|
|
"0.680\t74\t012289\n",
|
|
"1.000\t75\t021151\n",
|
|
"0.557\t76\t013684\n",
|
|
"0.527\t77\t031184\n",
|
|
"0.683\t78\t000659\n",
|
|
"0.795\t79\t012359\n",
|
|
"0.698\t80\t000283\n",
|
|
"0.761\t81\t009445\n",
|
|
"0.849\t82\t008965\n",
|
|
"0.618\t83\t005293\n",
|
|
"0.589\t84\t005293\n",
|
|
"0.785\t85\t020601\n",
|
|
"0.613\t86\t031069\n",
|
|
"0.775\t87\t009921\n",
|
|
"0.700\t88\t001769\n",
|
|
"0.636\t89\t010481\n",
|
|
"0.703\t90\t012295\n",
|
|
"0.734\t91\t011946\n",
|
|
"0.670\t92\t012825\n",
|
|
"0.728\t93\t020841\n",
|
|
"0.686\t94\t009040\n",
|
|
"0.549\t95\t009718\n",
|
|
"0.554\t96\t004998\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import os,re,difflib,Levenshtein,time,json\n",
|
|
"\n",
|
# IMPORTANT: range of existing problem IDs to compare against.
# Comma-separated chunks; a chunk "a:b" denotes the closed integer interval [a, b].
old_problems_range = "1:999999"
# Similarity cut-off. NOTE(review): not referenced by any other code visible in
# this notebook -- confirm whether it is still needed.
threshold = 0.85

# File holding the new problems to compare. The default is a machine-specific
# path; set the NEW_PROBLEMS_TEX environment variable to point elsewhere without
# editing the notebook.
filename = os.environ.get(
    "NEW_PROBLEMS_TEX",
    r"C:\Users\weiye\Documents\wwy sync\临时工作区\空中课堂第六批.tex",
)
|
|
"\n",
|
# Build the list of problem IDs: chunks are separated by commas, and a chunk
# of the form "a:b" denotes the closed integer interval [a, b].
def generate_number_set(string):
    """Expand a range specification into zero-padded ID strings.

    Args:
        string: Comma-separated chunks. Each chunk is either a single number
            or ``start:end`` (both ends inclusive). Whitespace anywhere in
            the specification is ignored.

    Returns:
        A list of strings, each left-padded with zeros to at least six
        characters, in the order the chunks appear. Empty chunks (for example
        after a trailing comma, or for an empty specification) contribute
        nothing.

    Raises:
        ValueError: If a ``start:end`` chunk does not consist of exactly two
            integer parts.
    """
    compact = re.sub(r"\s", "", string)
    numbers_list = []
    for chunk in compact.split(","):
        if not chunk:
            # "".zfill(6) would otherwise produce the bogus ID "000000".
            continue
        if ":" in chunk:
            start, end = chunk.split(":")
            numbers_list.extend(
                str(number).zfill(6) for number in range(int(start), int(end) + 1)
            )
        else:
            numbers_list.append(chunk.zfill(6))
    return numbers_list
|
|
"\n",
|
# String pre-processing
def pre_treating(string):
    """Normalize a LaTeX problem text before similarity comparison.

    The following removals are applied in order:

    1. whole ``\\begin{center}...\\end{center}`` environments;
    2. the tokens ``bracket{N}``, ``blank{N}``, ``fourch``, ``twoch``,
       ``onech``, ``mathrm`` and ``text``;
    3. whitespace, backslashes, braces, dollar signs, parentheses and
       square brackets;
    4. newlines and tabs;
    5. the tokens ``displaystyle`` and ``overrightarrow``;
    6. the punctuation characters ``, . : ; ?``.

    Args:
        string: The raw problem text.

    Returns:
        The text with all of the above removed.
    """
    removal_steps = (
        r"\\begin\{center\}[\s\S]*?\\end\{center\}",
        r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)|(mathrm)|(text)",
        r"[\s\\\{\}\$\(\)\[\]]",
        r"[\n\t]",
        r"(displaystyle)|(overrightarrow)",
        r"[,\.:;?]",
    )
    for pattern in removal_steps:
        string = re.sub(pattern, "", string)
    return string
|
|
"\n",
|
# String comparison based on difflib
def difflab_get_equal_rate(str1, str2):
    """Return the ``difflib.SequenceMatcher`` ratio of ``str1`` and ``str2``.

    No junk filter is used (``isjunk`` is ``None``).
    """
    matcher = difflib.SequenceMatcher(None, str1, str2)
    return matcher.ratio()
|
|
"\n",
|
# String comparison based on Levenshtein.jaro
def jaro_get_equal_rate(str1, str2):
    """Return the result of ``Levenshtein.jaro`` for ``str1`` and ``str2``."""
    score = Levenshtein.jaro(str1, str2)
    return score
|
|
"\n",
|
# String comparison based on Levenshtein.ratio
def Lev_get_equal_rate(str1, str2):
    """Return the result of ``Levenshtein.ratio`` for ``str1`` and ``str2``."""
    score = Levenshtein.ratio(str1, str2)
    return score
|
|
"\n",
|
def GenerateProblemListFromString(data):
    """Split LaTeX source into a list of ``(problem_text, suffix)`` tuples.

    If ``data`` contains a ``\\begin{document}...\\end{document}`` pair, only
    the text inside the first such pair is processed; otherwise the whole
    input is used. Every ``\\item`` up to the next ``\\item`` or the next
    ``\\end{enumerate}`` becomes one problem.

    The suffix of a problem is taken from the last line of the form
    ``%<letters/digits>`` that precedes this occurrence of the problem, with
    the ``%`` removed. It is ``""`` when no such line precedes it.

    Args:
        data: The LaTeX source text.

    Returns:
        A list of ``(problem_text, suffix)`` tuples in document order, where
        ``problem_text`` is stripped of surrounding whitespace.
    """
    # Restrict to the document body when there is one. The input is used as is
    # when no document environment is present.
    body = re.search(r"\\begin\{document\}([\s\S]*?)\\end\{document\}", data)
    if body is not None:
        data = body.group(1)
    data = re.sub(r"\n{2,}", "\n", data)
    # Close every item right before the next one starts and at the end of each
    # enumerate environment, so each problem is delimited as \item ... \enditem.
    data = re.sub(r"\\item", r"\\enditem\\item", data)
    data = re.sub(r"\\end\{enumerate\}", r"\\enditem", data)
    ProblemsList = []
    for item in re.finditer(r"\\item([\s\S]*?)\\enditem", data):
        raw = item.group(1)
        problem = raw.strip()
        # Offset of this very occurrence of the stripped text. Searching for
        # the text instead would find the first copy when a problem repeats.
        startpos = item.start(1) + len(raw) - len(raw.lstrip())
        markers = re.findall(r"\n\%[\dA-Za-z]+", data[:startpos])
        suffix = markers[-1].replace("%", "").strip() if markers else ""
        ProblemsList.append((problem, suffix))
    return ProblemsList
|
|
"\n",
|
|
"\n",
|
# Similarity measure used for the comparison
sim_test = jaro_get_equal_rate

# Load the existing problem database
with open(r"../题库0.3/Problems.json", "r", encoding="utf8") as f:
    database = f.read()
pro_dict = json.loads(database)

# Load the file under comparison and split it into problems
with open(filename, "r", encoding="u8") as f:
    newdatabase = f.read()
new_pro_list = GenerateProblemListFromString(newdatabase)

# Normalize the existing problems whose ID lies in the requested range.
# Membership is tested against a set: the range may expand to about a million
# IDs, and scanning a list once per database key is needlessly quadratic.
pro_dict_treated = {}
idrange_raw = generate_number_set(old_problems_range)
idrange_lookup = set(idrange_raw)
idrange = [problem_id for problem_id in pro_dict if problem_id in idrange_lookup]
for p in idrange:
    pro_dict_treated[p] = pre_treating(pro_dict[p]["content"])

# Normalize the new problems, keyed by their 1-based position in the file
new_dict_treated = {
    position: pre_treating(content)
    for position, (content, _suffix) in enumerate(new_pro_list, start=1)
}

# For every new problem, report the most similar existing problem as
# "<similarity>\t<position>\t<existing id>".
for i, new_p in new_dict_treated.items():
    maxsim = 0
    # Reset for every new problem. Otherwise a problem with no positive
    # similarity would report the previous problem's match, or raise NameError
    # on the first one.
    argmax = ""
    for p, old_p in pro_dict_treated.items():
        sim = sim_test(new_p, old_p)
        if sim > maxsim:
            maxsim = sim
            argmax = p
    print("%.3f\t%d\t%s" %(maxsim,i,argmax))
    # To inspect a pair, print new_pro_list[i - 1][0] (the new problem) and
    # pro_dict[argmax]["content"] (the best-matching existing problem).
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"999999"
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"len(idrange)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "mathdept",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.15"
|
|
},
|
|
"orig_nbformat": 4,
|
|
"vscode": {
|
|
"interpreter": {
|
|
"hash": "ff3c292c316ba85de6f1ad75f19c731e79d694e741b6f515ec18f14996fe48dc"
|
|
}
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|