This repository has been archived on 2024-06-23. You can view files and clone it, but cannot push or open issues or pull requests.
mathdeptv2/工具/相似题目检测.ipynb

260 lines
11 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"旧题目数: 1907 , 新题目数: 500\n",
"开始新题与旧题的比对\n",
"50\n",
"100\n",
"150\n",
"200\n",
"250\n",
"300\n",
"350\n",
"400\n",
"450\n",
"500\n",
"开始新题之间的比对\n",
"50\n",
"100\n",
"150\n",
"200\n",
"250\n",
"300\n",
"350\n",
"400\n",
"450\n",
"总耗时: 3.233793258666992 秒.\n",
"发现相似: 30 , 其中已标注: 28 .\n"
]
}
],
"source": [
"import os,re,difflib,Levenshtein,time,json\n",
"\n",
"# 重要!!! 新旧题目的范围(有重复默认为新题)\n",
"id_new_problems = \"1:500\"\n",
"id_old_problems = \"30000:50000\"\n",
"threshold = 0.95\n",
"\n",
"#生成数码列表, 逗号分隔每个区块, 区块内部用:表示整数闭区间\n",
"def generate_number_set(string):\n",
" string = re.sub(r\"[\\n\\s]\",\"\",string)\n",
" string_list = string.split(\",\")\n",
" numbers_list = []\n",
" for s in string_list:\n",
" if not \":\" in s:\n",
" numbers_list.append(s.zfill(6))\n",
" else:\n",
" start,end = s.split(\":\")\n",
" for ind in range(int(start),int(end)+1):\n",
" numbers_list.append(str(ind).zfill(6))\n",
" return numbers_list\n",
"\n",
"#字符串预处理\n",
"def pre_treating(string):\n",
" string = re.sub(r\"\\\\begin\\{center\\}[\\s\\S]*?\\\\end\\{center\\}\",\"\",string)\n",
" string = re.sub(r\"(bracket\\{\\d+\\})|(blank\\{\\d+\\})|(fourch)|(twoch)|(onech)\",\"\",string)\n",
" string = re.sub(r\"[\\s\\\\\\{\\}\\$\\(\\)\\[\\]]\",\"\",string)\n",
" string = re.sub(r\"[\\n\\t]\",\"\",string)\n",
" string = re.sub(r\"(displaystyle)|(overrightarrow)\",\"\",string)\n",
" string = re.sub(r\"[,\\.:;?]\",\"\",string)\n",
" return string\n",
"\n",
"#difflab字符串比较\n",
"def difflab_get_equal_rate(str1, str2):\n",
" # str1 = pre_treating(str1)\n",
" # str2 = pre_treating(str2)\n",
" return difflib.SequenceMatcher(None, str1, str2).ratio()\n",
"\n",
"#Levenshtein jaro字符串比较\n",
"def jaro_get_equal_rate(str1,str2):\n",
" # str1 = pre_treating(str1)\n",
" # str2 = pre_treating(str2)\n",
" return Levenshtein.jaro(str1,str2)\n",
"\n",
"#Levenshtein 字符串比较\n",
"def Lev_get_equal_rate(str1,str2):\n",
" # str1 = pre_treating(str1)\n",
" # str2 = pre_treating(str2)\n",
" return Levenshtein.ratio(str1,str2)\n",
"\n",
"\n",
"\n",
"\n",
"#指定对比方法\n",
"sim_test = jaro_get_equal_rate\n",
"\n",
"#读入题库\n",
"with open(r\"../题库0.3/Problems.json\",\"r\",encoding = \"utf8\") as f:\n",
" database = f.read()\n",
"pro_dict = json.loads(database)\n",
"\n",
"#生成旧题目数据库字典与新题目数据库字典\n",
"new_id_list_raw = generate_number_set(id_new_problems)\n",
"new_id_list = [id for id in pro_dict if id in new_id_list_raw]\n",
"old_id_list_raw = generate_number_set(id_old_problems)\n",
"old_id_list = [id for id in pro_dict if (id in old_id_list_raw and not id in new_id_list_raw)]\n",
"old_problems_dict = {}\n",
"new_problems_dict = {}\n",
"old_problems_dict_content = {}\n",
"new_problems_dict_content = {}\n",
"for id in new_id_list:\n",
" new_problems_dict[id] = pro_dict[id]\n",
" new_problems_dict_content[id] = pre_treating(pro_dict[id][\"content\"])\n",
"for id in old_id_list:\n",
" old_problems_dict[id] = pro_dict[id]\n",
" old_problems_dict_content[id] = pre_treating(pro_dict[id][\"content\"])\n",
"print(\"旧题目数:\",len(old_problems_dict),\", 新题目数:\",len(new_problems_dict))\n",
"\n",
"#记录起始时间\n",
"start_time = time.time()\n",
"suspect_count = 0\n",
"remarked = 0\n",
"\n",
"alike_problems = \"\"\n",
"\n",
"\n",
"\n",
"#开始新题与旧题的比对\n",
"count = 0\n",
"print(\"开始新题与旧题的比对\")\n",
"for id_new in new_problems_dict:\n",
" count += 1\n",
" if count % 50 == 0:\n",
" print(count)\n",
" for id_old in old_problems_dict:\n",
" similar_rate = sim_test(new_problems_dict_content[id_new],old_problems_dict_content[id_old])\n",
" if similar_rate > threshold or id_new in old_problems_dict[id_old][\"related\"] or id_new in old_problems_dict[id_old][\"same\"] or id_old in new_problems_dict[id_new][\"related\"] or id_old in new_problems_dict[id_new][\"same\"]:\n",
" suspect_count += 1\n",
" if not (id_new in old_problems_dict[id_old][\"related\"] or id_new in old_problems_dict[id_old][\"same\"] or id_old in new_problems_dict[id_new][\"related\"] or id_old in new_problems_dict[id_new][\"same\"]):\n",
" alike_problems += (\"%.4f\" %similar_rate) + \"\\n\\n\" + id_new + \" \" + new_problems_dict[id_new][\"content\"] + \"\\n\\n\" + id_old + \" \" + old_problems_dict[id_old][\"content\"] + \"\\n\\n\"\n",
" else:\n",
" remarked += 1\n",
"\n",
"#开始新题之间的比对\n",
"count = 0\n",
"print(\"开始新题之间的比对\")\n",
"while len(new_problems_dict) >= 2:\n",
" count += 1\n",
" if count % 50 == 0:\n",
" print(count)\n",
" keys = list(new_problems_dict.keys())\n",
" current_problem = new_problems_dict.pop(keys[0])\n",
" current_problem_content = new_problems_dict_content[current_problem[\"id\"]]\n",
" for id_new in new_problems_dict:\n",
" similar_rate = sim_test(new_problems_dict_content[id_new],current_problem_content)\n",
" if similar_rate > threshold or id_new in current_problem[\"related\"] or id_new in current_problem[\"same\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"related\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"same\"]:\n",
" suspect_count += 1\n",
" if not (id_new in current_problem[\"related\"] or id_new in current_problem[\"same\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"related\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"same\"]):\n",
" alike_problems += (\"%.4f\" %similar_rate) + \"\\n\\n\" + id_new + \" \" + new_problems_dict[id_new][\"content\"] + \"\\n\\n\" + current_problem[\"id\"] + \" \" + current_problem[\"content\"] + \"\\n\\n\"\n",
" else:\n",
" remarked += 1\n",
"\n",
"\n",
"#记录终止时间及显示结果\n",
"end_time = time.time()\n",
"print(\"总耗时:\",end_time-start_time,\"秒.\")\n",
"print(\"发现相似: \",suspect_count,\", 其中已标注: \",remarked,\".\")\n",
"\n",
"with open(\"临时文件/相似题目.txt\",\"w\",encoding=\"utf8\") as f:\n",
" f.write(alike_problems)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#记录起始时间\n",
"start_time = time.time()\n",
"suspect_count = 0\n",
"remarked = 0\n",
"\n",
"alike_problems = \"\"\n",
"\n",
"\n",
"\n",
"#开始新题与旧题的比对\n",
"count = 0\n",
"print(\"开始新题与旧题的比对\")\n",
"for id_new in new_problems_dict:\n",
" count += 1\n",
" if count % 50 == 0:\n",
" print(count)\n",
" for id_old in old_problems_dict:\n",
" similar_rate = sim_test(new_problems_dict_content[id_new],old_problems_dict_content[id_old])\n",
" if similar_rate > threshold or id_new in old_problems_dict[id_old][\"related\"] or id_new in old_problems_dict[id_old][\"same\"] or id_old in new_problems_dict[id_new][\"related\"] or id_old in new_problems_dict[id_new][\"same\"]:\n",
" suspect_count += 1\n",
" if not (id_new in old_problems_dict[id_old][\"related\"] or id_new in old_problems_dict[id_old][\"same\"] or id_old in new_problems_dict[id_new][\"related\"] or id_old in new_problems_dict[id_new][\"same\"]):\n",
" alike_problems += (\"%.4f\" %similar_rate) + \"\\n\\n\" + id_new + \" \" + new_problems_dict[id_new][\"content\"] + \"\\n\\n\" + id_old + \" \" + old_problems_dict[id_old][\"content\"] + \"\\n\\n\"\n",
" else:\n",
" remarked += 1\n",
"\n",
"#开始新题之间的比对\n",
"count = 0\n",
"print(\"开始新题之间的比对\")\n",
"while len(new_problems_dict) >= 2:\n",
" count += 1\n",
" if count % 50 == 0:\n",
" print(count)\n",
" keys = list(new_problems_dict.keys())\n",
" current_problem = new_problems_dict.pop(keys[0])\n",
" current_problem_content = new_problems_dict_content[current_problem[\"id\"]]\n",
" for id_new in new_problems_dict:\n",
" similar_rate = sim_test(new_problems_dict_content[id_new],current_problem_content)\n",
" if similar_rate > threshold or id_new in current_problem[\"related\"] or id_new in current_problem[\"same\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"related\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"same\"]:\n",
" suspect_count += 1\n",
" if not (id_new in current_problem[\"related\"] or id_new in current_problem[\"same\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"related\"] or current_problem[\"id\"] in new_problems_dict[id_new][\"same\"]):\n",
" alike_problems += (\"%.4f\" %similar_rate) + \"\\n\\n\" + id_new + \" \" + new_problems_dict[id_new][\"content\"] + \"\\n\\n\" + current_problem[\"id\"] + \" \" + current_problem[\"content\"] + \"\\n\\n\"\n",
" else:\n",
" remarked += 1\n",
"\n",
"\n",
"#记录终止时间及显示结果\n",
"end_time = time.time()\n",
"print(\"总耗时:\",end_time-start_time,\"秒.\")\n",
"print(\"发现相似: \",suspect_count,\", 其中已标注: \",remarked,\".\")\n",
"\n",
"with open(\"临时文件/相似题目.txt\",\"w\",encoding=\"utf8\") as f:\n",
" f.write(alike_problems)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "mathdept",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "ff3c292c316ba85de6f1ad75f19c731e79d694e741b6f515ec18f14996fe48dc"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}