{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.789\t1\t005886\n",
      "0.805\t2\t006207\n",
      "0.812\t3\t006137\n",
      "0.798\t4\t012269\n",
      "0.817\t5\t010989\n",
      "0.888\t6\t021503\n",
      "0.815\t7\t021484\n",
      "0.959\t8\t003206\n",
      "0.839\t9\t011090\n"
     ]
    }
   ],
   "source": [
    "import os,re,difflib,Levenshtein,time,json\n",
    "\n",
    "# IMPORTANT: ID range of the existing problems to compare against\n",
    "old_problems_range = \"1:30000\"\n",
    "# Similarity regarded as a probable duplicate (defined here but not applied by the report below)\n",
    "threshold = 0.85\n",
    "\n",
    "# File with the new problems to be checked\n",
    "filename = r\"C:\\Users\\weiye\\Documents\\wwy sync\\临时工作区\\自拟题目9.tex\"\n",
    "\n",
    "# Build the list of ID strings: commas separate blocks, start:end inside a block is a closed integer interval\n",
    "def generate_number_set(string):\n",
    "    \"\"\"Expand a range specification into a list of 6-digit, zero-padded ID strings.\n",
    "\n",
    "    Whitespace is ignored and empty blocks (e.g. after a trailing comma) are skipped.\n",
    "    \"\"\"\n",
    "    string = re.sub(r\"\\s\",\"\",string)\n",
    "    numbers_list = []\n",
    "    for s in string.split(\",\"):\n",
    "        if not s:\n",
    "            continue  # an empty block would otherwise become the bogus ID \"000000\"\n",
    "        if \":\" not in s:\n",
    "            numbers_list.append(s.zfill(6))\n",
    "        else:\n",
    "            start,end = s.split(\":\")\n",
    "            for ind in range(int(start),int(end)+1):\n",
    "                numbers_list.append(str(ind).zfill(6))\n",
    "    return numbers_list\n",
    "\n",
    "# String preprocessing\n",
    "def pre_treating(string):\n",
    "    \"\"\"Strip markup from a LaTeX problem so that only its core characters are compared.\n",
    "\n",
    "    The keyword removals are plain substring matches, so they also hit words\n",
    "    that merely contain one of the keywords.\n",
    "    \"\"\"\n",
    "    # Remove everything from a center environment's begin to its end\n",
    "    string = re.sub(r\"\\\\begin\\{center\\}[\\s\\S]*?\\\\end\\{center\\}\",\"\",string)\n",
    "    # Remove bracket{n}/blank{n} placeholders and the listed macro names\n",
    "    string = re.sub(r\"(bracket\\{\\d+\\})|(blank\\{\\d+\\})|(fourch)|(twoch)|(onech)|(mathrm)|(text)\",\"\",string)\n",
    "    # Remove all whitespace, backslashes, braces, dollar signs, parentheses and square brackets\n",
    "    string = re.sub(r\"[\\s\\\\\\{\\}\\$\\(\\)\\[\\]]\",\"\",string)\n",
    "    string = re.sub(r\"(displaystyle)|(overrightarrow)\",\"\",string)\n",
    "    # Remove ASCII punctuation\n",
    "    string = re.sub(r\"[,\\.:;?]\",\"\",string)\n",
    "    return string\n",
    "\n",
    "# String comparison with difflib\n",
    "def difflab_get_equal_rate(str1, str2):\n",
    "    \"\"\"Return difflib.SequenceMatcher(None, str1, str2).ratio().\"\"\"\n",
    "    return difflib.SequenceMatcher(None, str1, str2).ratio()\n",
    "\n",
    "# String comparison with Levenshtein.jaro\n",
    "def jaro_get_equal_rate(str1,str2):\n",
    "    \"\"\"Return Levenshtein.jaro(str1, str2).\"\"\"\n",
    "    return Levenshtein.jaro(str1,str2)\n",
    "\n",
    "# String comparison with Levenshtein.ratio\n",
    "def Lev_get_equal_rate(str1,str2):\n",
    "    \"\"\"Return Levenshtein.ratio(str1, str2).\"\"\"\n",
    "    return Levenshtein.ratio(str1,str2)\n",
    "\n",
    "def GenerateProblemListFromString(problem_string):\n",
    "    \"\"\"Split LaTeX source into a list of (problem_text, suffix) tuples.\n",
    "\n",
    "    Only the body of the document environment is scanned; when there is no\n",
    "    such environment the whole string is used. problem_text is the stripped\n",
    "    text of one item. suffix is the last tag of the form percent sign plus\n",
    "    letters/digits at the start of a line before that item, without the\n",
    "    percent sign, or an empty string when no such tag precedes the item.\n",
    "    \"\"\"\n",
    "    doc_match = re.search(r\"\\\\begin\\{document\\}([\\s\\S]*?)\\\\end\\{document\\}\",problem_string)\n",
    "    data = doc_match.group(1) if doc_match else problem_string\n",
    "    data = re.sub(r\"\\n{2,}\",\"\\n\",data)\n",
    "    # Close every item with an explicit enditem marker so that each item can be captured on its own\n",
    "    data = re.sub(r\"\\\\item\",r\"\\\\enditem\\\\item\",data)\n",
    "    data = re.sub(r\"\\\\end\\{enumerate\\}\",r\"\\\\enditem\",data)\n",
    "    ProblemsList = []\n",
    "    for item_match in re.finditer(r\"\\\\item([\\s\\S]*?)\\\\enditem\",data):\n",
    "        # Use the match position rather than searching for the item text again:\n",
    "        # repeated or empty items would be found at their first occurrence and get the wrong suffix\n",
    "        suflist = re.findall(r\"\\n\\%[\\dA-Za-z]+\",data[:item_match.start(1)])\n",
    "        suffix = suflist[-1].replace(\"%\",\"\").strip() if suflist else \"\"\n",
    "        ProblemsList.append((item_match.group(1).strip(),suffix))\n",
    "    return ProblemsList\n",
    "\n",
    "\n",
    "# Similarity measure used for the comparison\n",
    "sim_test = jaro_get_equal_rate\n",
    "\n",
    "# Load the problem database\n",
    "with open(r\"../题库0.3/Problems.json\",\"r\",encoding = \"utf8\") as f:\n",
    "    database = f.read()\n",
    "pro_dict = json.loads(database)\n",
    "\n",
    "# Load and split the new problems\n",
    "with open(filename,\"r\",encoding=\"u8\") as f:\n",
    "    newdatabase = f.read()\n",
    "new_pro_list = GenerateProblemListFromString(newdatabase)\n",
    "\n",
    "# Preprocess the database problems whose ID lies in the requested range\n",
    "pro_dict_treated = {}\n",
    "idrange_raw = generate_number_set(old_problems_range)\n",
    "idrange_lookup = set(idrange_raw)  # set membership keeps the filter linear in the database size\n",
    "idrange = [pid for pid in pro_dict if pid in idrange_lookup]\n",
    "for p in idrange:\n",
    "    pro_dict_treated[p] = pre_treating(pro_dict[p][\"content\"])\n",
    "\n",
    "# Preprocess the new problems, numbered from 1\n",
    "new_dict_treated = {}\n",
    "for i in range(len(new_pro_list)):\n",
    "    new_dict_treated[i+1] = pre_treating(new_pro_list[i][0])\n",
    "\n",
    "# For every new problem print: best similarity, problem number, ID of the most similar database problem\n",
    "for i in new_dict_treated:\n",
    "    new_p = new_dict_treated[i]\n",
    "    # Reset both trackers per problem; otherwise the previous winner (or an undefined name) is reported\n",
    "    maxsim = 0\n",
    "    argmax = None\n",
    "    for p in pro_dict_treated:\n",
    "        old_p = pro_dict_treated[p]\n",
    "        sim = sim_test(new_p,old_p)\n",
    "        if sim > maxsim:\n",
    "            maxsim = sim\n",
    "            argmax = p\n",
    "    print(\"%.3f\\t%d\\t%s\" %(maxsim,i,argmax))\n",
    "    # Debug output of the matched pair (only valid when argmax is not None):\n",
    "    # print(\"\\n新题: %s\" %new_pro_list[i-1][0])\n",
    "    # print(\"\\n原题: %s\\n\\n\\n\" %pro_dict[argmax][\"content\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "999999"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(idrange)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mathdept",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.15"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "ff3c292c316ba85de6f1ad75f19c731e79d694e741b6f515ec18f14996fe48dc"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}