This repository has been archived on 2024-06-23. You can view files and clone it, but cannot push or open issues or pull requests.
mathdeptv2/工具/相同题目检测.ipynb

189 lines
5.3 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"题目数: 18795\n",
"500\n",
"1000\n",
"1500\n",
"2000\n",
"2500\n",
"3000\n",
"3500\n",
"4000\n",
"4500\n",
"5000\n",
"5500\n",
"6000\n",
"6500\n",
"7000\n",
"7500\n",
"8000\n",
"8500\n",
"9000\n",
"9500\n",
"10000\n",
"10500\n",
"11000\n",
"11500\n",
"12000\n",
"12500\n",
"13000\n",
"13500\n",
"14000\n",
"14500\n",
"15000\n",
"15500\n",
"16000\n",
"16500\n",
"17000\n",
"17500\n",
"18000\n",
"耗时: 448.506秒\n"
]
}
],
"source": [
"import os,re,difflib,Levenshtein,time,json\n",
"\n",
"# Similarity score that a pair must exceed (strictly) to be reported as the same problem\n",
"threshold = 0.99\n",
"\n",
"# Text file that receives the groups of look-alike problem ids\n",
"outputfile = r\"临时文件/相同题目列表.txt\"\n",
"\n",
"# Expand an id specification into a list of zero-padded ids.\n",
"# Blocks are separated by commas; inside a block, \"a:b\" denotes the closed integer range a..b.\n",
"def generate_number_set(string):\n",
"    \"\"\"Expand an id specification such as ``1:3,10`` into 6-digit id strings.\n",
"\n",
"    All whitespace is removed first. Each comma-separated block is either a\n",
"    single id, which is left-padded with zeros to 6 characters, or a closed\n",
"    range ``start:end`` whose integers are each padded the same way.\n",
"    Empty blocks (an empty specification, a doubled or trailing comma) are\n",
"    skipped.\n",
"\n",
"    Args:\n",
"        string: The id specification.\n",
"\n",
"    Returns:\n",
"        The list of id strings, in the order the blocks appear.\n",
"\n",
"    Raises:\n",
"        ValueError: If a range endpoint is not an integer or a block holds\n",
"            more than one ``:``.\n",
"    \"\"\"\n",
"    string = re.sub(r\"[\\n\\s]\",\"\",string)\n",
"    numbers_list = []\n",
"    for segment in string.split(\",\"):\n",
"        if not segment:\n",
"            # Padding an empty block would invent the bogus id \"000000\".\n",
"            continue\n",
"        if \":\" in segment:\n",
"            start, end = segment.split(\":\")\n",
"            numbers_list.extend(\n",
"                str(number).zfill(6) for number in range(int(start), int(end) + 1)\n",
"            )\n",
"        else:\n",
"            numbers_list.append(segment.zfill(6))\n",
"    return numbers_list\n",
"\n",
"# String pre-treatment\n",
"def pre_treating(string):\n",
"    \"\"\"Strip layout and markup from a problem text and return what is left.\n",
"\n",
"    The removals run in a fixed order: center environments, answer\n",
"    placeholders and choice-layout macro names, whitespace together with\n",
"    backslashes / braces / dollar signs / parentheses / square brackets,\n",
"    the words displaystyle and overrightarrow, and finally the ASCII\n",
"    punctuation characters , . : ; ?\n",
"\n",
"    Args:\n",
"        string: The raw problem text.\n",
"\n",
"    Returns:\n",
"        The text with every listed pattern deleted.\n",
"    \"\"\"\n",
"    # Order matters: the first two patterns rely on backslashes and braces\n",
"    # that the third pattern deletes.\n",
"    removal_patterns = (\n",
"        r\"\\\\begin\\{center\\}[\\s\\S]*?\\\\end\\{center\\}\",\n",
"        r\"(bracket\\{\\d+\\})|(blank\\{\\d+\\})|(fourch)|(twoch)|(onech)\",\n",
"        r\"[\\s\\\\\\{\\}\\$\\(\\)\\[\\]]\",\n",
"        r\"[\\n\\t]\",\n",
"        r\"(displaystyle)|(overrightarrow)\",\n",
"        r\"[,\\.:;?]\",\n",
"    )\n",
"    cleaned = string\n",
"    for pattern in removal_patterns:\n",
"        cleaned = re.sub(pattern, \"\", cleaned)\n",
"    return cleaned\n",
"\n",
"# String comparison based on difflib\n",
"def difflab_get_equal_rate(str1, str2):\n",
"    \"\"\"Return the difflib SequenceMatcher ratio of ``str1`` and ``str2``.\n",
"\n",
"    The strings are compared exactly as given; no pre-treatment happens here.\n",
"    \"\"\"\n",
"    matcher = difflib.SequenceMatcher(None, str1, str2)\n",
"    similarity = matcher.ratio()\n",
"    return similarity\n",
"\n",
"# String comparison based on Levenshtein.jaro\n",
"def jaro_get_equal_rate(str1,str2):\n",
"    \"\"\"Return ``Levenshtein.jaro`` of ``str1`` and ``str2``.\n",
"\n",
"    The strings are compared exactly as given; no pre-treatment happens here.\n",
"    \"\"\"\n",
"    similarity = Levenshtein.jaro(str1, str2)\n",
"    return similarity\n",
"\n",
"# String comparison based on Levenshtein.ratio\n",
"def Lev_get_equal_rate(str1,str2):\n",
"    \"\"\"Return ``Levenshtein.ratio`` of ``str1`` and ``str2``.\n",
"\n",
"    The strings are compared exactly as given; no pre-treatment happens here.\n",
"    \"\"\"\n",
"    similarity = Levenshtein.ratio(str1, str2)\n",
"    return similarity\n",
"\n",
"\n",
"\n",
"\n",
"# Similarity function used for the comparison\n",
"sim_test = jaro_get_equal_rate\n",
"\n",
"# Load the problem database\n",
"with open(r\"../题库0.3/Problems.json\",\"r\",encoding = \"utf8\") as f:\n",
"    database = f.read()\n",
"pro_dict = json.loads(database)\n",
"\n",
"# Per-problem shallow copies whose \"content\" is replaced by its pre-treated form;\n",
"# pro_dict itself keeps the raw text. A comprehension is used so that no loop\n",
"# variable (previously named `id`) is left behind in the notebook namespace.\n",
"pro_dict_treated = {\n",
"    problem_id: {**record, \"content\": pre_treating(record[\"content\"])}\n",
"    for problem_id, record in pro_dict.items()\n",
"}\n",
"\n",
"\n",
"print(\"题目数:\",len(pro_dict))\n",
"\n",
"# Create the output directory up front: the comparison below is slow, and a\n",
"# missing directory would otherwise only surface at the final write.\n",
"output_dir = os.path.dirname(outputfile)\n",
"if output_dir:\n",
"    os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"# Record the start time\n",
"starttime = time.time()\n",
"alike_groups = []\n",
"\n",
"\n",
"count = 0\n",
"keys = list(pro_dict_treated.keys())\n",
"while len(keys) >= 2:\n",
"    count += 1\n",
"    if count % 500 == 0:\n",
"        print(count)\n",
"\n",
"    # Take the first remaining problem and compare it with every later one\n",
"    currentid = keys.pop(0)\n",
"    content1 = pro_dict_treated[currentid][\"content\"]\n",
"    # Ids already recorded as same/related for this problem are not re-reported\n",
"    known_same = pro_dict[currentid][\"same\"]\n",
"    known_related = pro_dict[currentid][\"related\"]\n",
"    same = [\n",
"        other_id\n",
"        for other_id in keys\n",
"        if other_id not in known_same\n",
"        and other_id not in known_related\n",
"        and sim_test(content1, pro_dict_treated[other_id][\"content\"]) > threshold\n",
"    ]\n",
"    if same:\n",
"        alike_groups.append(\",\".join([currentid] + same))\n",
"        # Drop the matched ids in one pass so each id is reported in one group only\n",
"        matched = set(same)\n",
"        keys = [key for key in keys if key not in matched]\n",
"\n",
"endtime = time.time()\n",
"print(\"耗时: %.3f秒\" %(endtime-starttime))\n",
"\n",
"# One group per paragraph: comma-separated ids followed by a blank line\n",
"alike_problems = \"\".join(group + \"\\n\\n\" for group in alike_groups)\n",
"with open(outputfile,\"w\",encoding = \"u8\") as f:\n",
"    f.write(alike_problems)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "mathdept",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}