This repository has been archived on 2024-06-23. You can view files and clone it, but cannot push or open issues or pull requests.
mathdeptv2/工具/新题比对.ipynb

250 lines
7.5 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.000\t1\t021365\n",
"1.000\t2\t021366\n",
"1.000\t3\t021367\n",
"1.000\t4\t021368\n",
"0.944\t5\t021369\n",
"0.947\t6\t021370\n",
"0.917\t7\t021372\n",
"1.000\t8\t021371\n",
"1.000\t9\t021373\n",
"1.000\t10\t021374\n",
"1.000\t11\t021375\n",
"1.000\t12\t021376\n",
"0.966\t13\t021377\n",
"1.000\t14\t022022\n",
"0.902\t15\t021379\n",
"0.865\t16\t021382\n",
"1.000\t17\t022023\n",
"0.987\t18\t021383\n",
"1.000\t19\t021384\n",
"1.000\t20\t021385\n",
"1.000\t21\t021386\n",
"1.000\t22\t021387\n",
"1.000\t23\t021389\n",
"1.000\t24\t022024\n",
"1.000\t25\t021390\n",
"1.000\t26\t022026\n",
"0.891\t27\t021392\n",
"0.965\t28\t021393\n",
"0.986\t29\t021394\n",
"0.940\t30\t021395\n",
"1.000\t31\t021396\n",
"1.000\t32\t022027\n",
"1.000\t33\t021397\n",
"1.000\t34\t022028\n",
"0.805\t35\t021401\n",
"1.000\t36\t021403\n",
"1.000\t37\t022029\n",
"1.000\t38\t022030\n",
"1.000\t39\t022031\n",
"1.000\t40\t022032\n",
"1.000\t41\t022033\n",
"1.000\t42\t022034\n",
"0.887\t43\t021410\n",
"1.000\t44\t022035\n",
"1.000\t45\t022036\n",
"1.000\t46\t022037\n",
"1.000\t47\t021413\n",
"0.959\t48\t022038\n",
"1.000\t49\t022039\n",
"1.000\t50\t021415\n",
"1.000\t51\t022040\n",
"1.000\t52\t022041\n",
"0.793\t53\t021418\n",
"0.807\t54\t021420\n",
"0.693\t55\t021421\n",
"1.000\t56\t021422\n",
"1.000\t57\t021423\n",
"1.000\t58\t022042\n",
"1.000\t59\t022043\n",
"0.805\t60\t021427\n",
"0.957\t61\t021425\n",
"0.770\t62\t021428\n",
"0.970\t63\t022044\n",
"1.000\t64\t022045\n",
"0.738\t65\t021430\n",
"1.000\t66\t022046\n",
"1.000\t67\t022047\n",
"0.792\t68\t021432\n",
"0.793\t69\t021434\n",
"0.721\t70\t021433\n",
"0.811\t71\t021435\n",
"0.728\t72\t021436\n",
"1.000\t73\t021437\n",
"0.989\t74\t021438\n",
"0.848\t75\t021440\n"
]
}
],
"source": [
"import os,re,difflib,Levenshtein,time,json\n",
"\n",
"# 重要!!! 范围\n",
"old_problems_range = \"21365:21440,22022:22047\"\n",
"threshold = 0.85\n",
"\n",
"# 待比对的文件\n",
"filename = r\"D:\\temp\\derivatives.tex\"\n",
"\n",
"#生成数码列表, 逗号分隔每个区块, 区块内部用:表示整数闭区间\n",
"def generate_number_set(string):\n",
" string = re.sub(r\"[\\n\\s]\",\"\",string)\n",
" string_list = string.split(\",\")\n",
" numbers_list = []\n",
" for s in string_list:\n",
" if not \":\" in s:\n",
" numbers_list.append(s.zfill(6))\n",
" else:\n",
" start,end = s.split(\":\")\n",
" for ind in range(int(start),int(end)+1):\n",
" numbers_list.append(str(ind).zfill(6))\n",
" return numbers_list\n",
"\n",
"#字符串预处理\n",
"def pre_treating(string):\n",
" string = re.sub(r\"\\\\begin\\{center\\}[\\s\\S]*?\\\\end\\{center\\}\",\"\",string)\n",
" string = re.sub(r\"(bracket\\{\\d+\\})|(blank\\{\\d+\\})|(fourch)|(twoch)|(onech)|(mathrm)|(text)\",\"\",string)\n",
" string = re.sub(r\"[\\s\\\\\\{\\}\\$\\(\\)\\[\\]]\",\"\",string)\n",
" string = re.sub(r\"[\\n\\t]\",\"\",string)\n",
" string = re.sub(r\"(displaystyle)|(overrightarrow)\",\"\",string)\n",
" string = re.sub(r\"[,\\.:;?]\",\"\",string)\n",
" return string\n",
"\n",
"#difflab字符串比较\n",
"def difflab_get_equal_rate(str1, str2):\n",
" return difflib.SequenceMatcher(None, str1, str2).ratio()\n",
"\n",
"#Levenshtein jaro字符串比较\n",
"def jaro_get_equal_rate(str1,str2):\n",
" return Levenshtein.jaro(str1,str2)\n",
"\n",
"#Levenshtein 字符串比较\n",
"def Lev_get_equal_rate(str1,str2):\n",
" return Levenshtein.ratio(str1,str2)\n",
"\n",
"def GenerateProblemListFromString(data):\n",
" try:\n",
" data = re.findall(r\"\\\\begin\\{document\\}([\\s\\S]*?)\\\\end\\{document\\}\",problems_string)[0]\n",
" except:\n",
" pass\n",
" data = re.sub(r\"\\n{2,}\",\"\\n\",data)\n",
" data = re.sub(r\"\\\\item\",r\"\\\\enditem\\\\item\",data)\n",
" data = re.sub(r\"\\\\end\\{enumerate\\}\",r\"\\\\enditem\",data)\n",
" ProblemList_raw = [p.strip() for p in re.findall(r\"\\\\item([\\s\\S]*?)\\\\enditem\",data)]\n",
" ProblemsList = []\n",
" for p in ProblemList_raw:\n",
" startpos = data.index(p)\n",
" tempdata = data[:startpos]\n",
" suflist = re.findall(r\"\\n\\%[\\dA-Za-z]+\",tempdata)\n",
" if len(suflist) > 0:\n",
" suffix = suflist[-1].replace(\"%\",\"\").strip()\n",
" else:\n",
" suffix = \"\"\n",
" ProblemsList.append((p,suffix))\n",
" return ProblemsList\n",
"\n",
"\n",
"#指定对比方法\n",
"sim_test = jaro_get_equal_rate\n",
"\n",
"#读入题库\n",
"with open(r\"../题库0.3/Problems.json\",\"r\",encoding = \"utf8\") as f:\n",
" database = f.read()\n",
"pro_dict = json.loads(database)\n",
"\n",
"with open(filename,\"r\",encoding=\"u8\") as f:\n",
" newdatabase = f.read()\n",
"new_pro_list = GenerateProblemListFromString(newdatabase)\n",
"\n",
"pro_dict_treated = {}\n",
"idrange = generate_number_set(old_problems_range)\n",
"for p in idrange:\n",
" pro_dict_treated[p] = pre_treating(pro_dict[p][\"content\"])\n",
"\n",
"new_dict_treated = {}\n",
"for i in range(len(new_pro_list)):\n",
" new_dict_treated[i+1] = pre_treating(new_pro_list[i][0])\n",
"\n",
"for i in new_dict_treated:\n",
" new_p = new_dict_treated[i]\n",
" maxsim = 0\n",
" for p in pro_dict_treated:\n",
" old_p = pro_dict_treated[p]\n",
" sim = sim_test(new_p,old_p)\n",
" if sim > maxsim:\n",
" maxsim = sim\n",
" argmax = p\n",
" print(\"%.3f\\t%d\\t%s\" %(maxsim,i,argmax))\n",
" # print(\"\\n新题: %s\" %new_pro_list[i-1][0])\n",
" # print(\"\\n原题: %s\\n\\n\\n\" %pro_dict[][\"content\"])\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"75"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(new_dict_treated)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "mathdept",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "42dd566da87765ddbe9b5c5b483063747fec4aacc5469ad554706e4b742e67b2"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}