From f53b2f263a395323fed8d5ec1157c60ca6dd84ba Mon Sep 17 00:00:00 2001 From: "weiye.wang" Date: Wed, 8 Mar 2023 06:47:33 +0800 Subject: [PATCH] 20230308 morning --- 工具/修改题目数据库.ipynb | 6 +- 工具/相同题目检测.ipynb | 772 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 775 insertions(+), 3 deletions(-) create mode 100644 工具/相同题目检测.ipynb diff --git a/工具/修改题目数据库.ipynb b/工具/修改题目数据库.ipynb index 6ec9efa8..94cf4c7a 100644 --- a/工具/修改题目数据库.ipynb +++ b/工具/修改题目数据库.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 72, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -11,7 +11,7 @@ "0" ] }, - "execution_count": 72, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -19,7 +19,7 @@ "source": [ "import os,re,json\n", "\"\"\"这里编辑题号(列表)后将在vscode中打开窗口, 编辑后保存关闭, 随后运行第二个代码块\"\"\"\n", - "problems = \"12709:12712\"\n", + "problems = \"003683,003698,011567,013880\"\n", "\n", "def generate_number_set(string,dict):\n", " string = re.sub(r\"[\\n\\s]\",\"\",string)\n", diff --git a/工具/相同题目检测.ipynb b/工具/相同题目检测.ipynb new file mode 100644 index 00000000..c668514f --- /dev/null +++ b/工具/相同题目检测.ipynb @@ -0,0 +1,772 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "题目数: 18162\n", + "000016\n", + "007761\n", + "\n", + "\n", + "\n", + "000022\n", + "007828\n", + "\n", + "\n", + "\n", + "50\n", + "000076\n", + "007924\n", + "\n", + "\n", + "\n", + "000089\n", + "007939\n", + "\n", + "\n", + "\n", + "000097\n", + "008220\n", + "\n", + "\n", + "\n", + "100\n", + "000100\n", + "008380\n", + "\n", + "\n", + "\n", + "000101\n", + "008198\n", + "\n", + "\n", + "\n", + "000104\n", + "008385\n", + "\n", + "\n", + "\n", + "000109\n", + "008231\n", + "\n", + "\n", + "\n", + "000116\n", + "008210\n", + "\n", + "\n", + "\n", + "150\n", + "200\n", + "250\n", + "000254\n", + "008823\n", + "\n", + "\n", + "\n", + "000266\n", + "022019\n", + "\n", + "\n", + "\n", + "000273\n", + "008847\n", + "\n", + "\n", + "\n", + "000276\n", + "008941\n", + "\n", + "\n", + "\n", + "000281\n", + "008942\n", + "\n", + "\n", + "\n", + "300\n", + "000308\n", + "008516\n", + "\n", + "\n", + "\n", + "000310\n", + "008519\n", + "\n", + "\n", + "\n", + "000314\n", + "008450\n", + "\n", + "\n", + "\n", + "000329\n", + "011073\n", + "\n", + "\n", + "\n", + "000330\n", + "011074\n", + "\n", + "\n", + "\n", + "000338\n", + "012422\n", + "\n", + "\n", + "\n", + "000347\n", + "010926\n", + "\n", + "\n", + "\n", + "000349\n", + "010927\n", + "\n", + "\n", + "\n", + "350\n", + "000351\n", + "010929\n", + "\n", + "\n", + "\n", + "000354\n", + "010932\n", + "\n", + "\n", + "\n", + "000374\n", + "011014\n", + "\n", + "\n", + "\n", + "000398\n", + "012078\n", + "\n", + "\n", + "\n", + "400\n", + "000413\n", + "004432\n", + "\n", + "\n", + "\n", + "000417\n", + "011093\n", + "\n", + "\n", + "\n", + "000421\n", + "011096\n", + "\n", + "\n", + "\n", + "000423\n", + "011098\n", + "\n", + "\n", + "\n", + "000424\n", + "011099\n", + "\n", + "\n", + "\n", + "000425\n", + "011100\n", + "\n", + "\n", + "\n", + "000426\n", + "011536\n", + "\n", + "\n", + "\n", + "000427\n", + "011537\n", + "\n", + "\n", + "\n", + "000428\n", + "011538\n", + "\n", + "\n", + "\n", + "000430\n", + "011540\n", + "\n", + "\n", + "\n", + "000431\n", + "011541\n", + "\n", + "\n", + "\n", + "000432\n", + "011542\n", + "\n", + "\n", + "\n", + "000435\n", + "004475\n", + "\n", + "\n", + "\n", + "000437\n", + "004427\n", + "\n", + "\n", + "\n", + "000443\n", + "004497\n", + "\n", + "\n", + "\n", + "450\n", + "000467\n", + "000878\n", + "\n", + "\n", + "\n", + "000469\n", + "012332\n", + "\n", + "\n", + "\n", + "000496\n", + "010965\n", + "\n", + "\n", + "\n", + "000498\n", + "010966\n", + "\n", + "\n", + "\n", + "000499\n", + "010967\n", + "\n", + "\n", + "\n", + "500\n", + "000500\n", + "010968\n", + "\n", + "\n", + "\n", + "000503\n", + "010972\n", + "\n", + "\n", + "\n", + "000512\n", + "011586\n", + "\n", + "\n", + "\n", + "000521\n", + "004430\n", + "\n", + "\n", + "\n", + "550\n", + "000559\n", + "010970\n", + "\n", + "\n", + "\n", + "000561\n", + "012661\n", + "\n", + "\n", + "\n", + "600\n", + "000613\n", + "013591\n", + "\n", + "\n", + "\n", + "000616\n", + "004143\n", + "\n", + "\n", + "\n", + "000617\n", + "004144\n", + "\n", + "\n", + "\n", + "000618\n", + "004145\n", + "\n", + "\n", + "\n", + "000619\n", + "004146\n", + "\n", + "\n", + "\n", + "000620\n", + "004148\n", + "\n", + "\n", + "\n", + "000629\n", + "011603\n", + "\n", + "\n", + "\n", + "000636\n", + "004080\n", + "\n", + "\n", + "\n", + "000637\n", + "004081\n", + "\n", + "\n", + "\n", + "000638\n", + "004082\n", + "\n", + "\n", + "\n", + "650\n", + "000652\n", + "011395\n", + "\n", + "\n", + "\n", + "000667\n", + "004060\n", + "\n", + "\n", + "\n", + "000673\n", + "004066\n", + "\n", + "\n", + "\n", + "000679\n", + "011371\n", + "\n", + "\n", + "\n", + "000680\n", + "011372\n", + "\n", + "\n", + "\n", + "000682\n", + "011374\n", + "\n", + "\n", + "\n", + "000684\n", + "011375\n", + "\n", + "\n", + "\n", + "000691\n", + "004085\n", + "\n", + "\n", + "\n", + "000693\n", + "004086\n", + "\n", + "\n", + "\n", + "000695\n", + "004087\n", + "\n", + "\n", + "\n", + "700\n", + "000707\n", + "011516\n", + "\n", + "\n", + "\n", + "000708\n", + "011517\n", + "\n", + "\n", + "\n", + "000709\n", + "011518\n", + "\n", + "\n", + "\n", + "000710\n", + "011519\n", + "\n", + "\n", + "\n", + "000711\n", + "011520\n", + "\n", + "\n", + "\n", + "000712\n", + "011521\n", + "\n", + "\n", + "\n", + "000717\n", + "011306\n", + "\n", + "\n", + "\n", + "000718\n", + "011307\n", + "\n", + "\n", + "\n", + "000721\n", + "013304\n", + "\n", + "\n", + "\n", + "750\n", + "000754\n", + "004088\n", + "\n", + "\n", + "\n", + "000757\n", + "004312\n", + "011389\n", + "\n", + "\n", + "\n", + "000758\n", + "011390\n", + "\n", + "\n", + "\n", + "000763\n", + "004319\n", + "\n", + "\n", + "\n", + "000764\n", + "004318\n", + "\n", + "\n", + "\n", + "000766\n", + "004290\n", + "\n", + "\n", + "\n", + "000767\n", + "004291\n", + "\n", + "\n", + "\n", + "000768\n", + "004292\n", + "\n", + "\n", + "\n", + "000769\n", + "004293\n", + "\n", + "\n", + "\n", + "000771\n", + "004295\n", + "\n", + "\n", + "\n", + "000773\n", + "004297\n", + "\n", + "\n", + "\n", + "000774\n", + "004298\n", + "\n", + "\n", + "\n", + "000775\n", + "004299\n", + "\n", + "\n", + "\n", + "000790\n", + "011309\n", + "\n", + "\n", + "\n", + "000795\n", + "030043\n", + "\n", + "\n", + "\n", + "800\n", + "000804\n", + "004065\n", + "\n", + "\n", + "\n", + "000818\n", + "011391\n", + "\n", + "\n", + "\n", + "000819\n", + "011392\n", + "\n", + "\n", + "\n", + "000820\n", + "011393\n", + "\n", + "\n", + "\n", + "000821\n", + "011394\n", + "\n", + "\n", + "\n", + "000824\n", + "011398\n", + "\n", + "\n", + "\n", + "000825\n", + "011399\n", + "\n", + "\n", + "\n", + "000836\n", + "011368\n", + "\n", + "\n", + "\n", + "000837\n", + "011369\n", + "\n", + "\n", + "\n", + "000838\n", + "011370\n", + "\n", + "\n", + "\n", + "000841\n", + "011373\n", + "\n", + "\n", + "\n", + "000844\n", + "011376\n", + "\n", + "\n", + "\n", + "000845\n", + "011377\n", + "\n", + "\n", + "\n", + "850\n", + "900\n", + "000911\n", + "011328\n", + "013764\n", + "\n", + "\n", + "\n", + "000922\n", + "011582\n", + "\n", + "\n", + "\n", + "000942\n", + "013377\n", + "\n", + "\n", + "\n", + "950\n", + "1000\n", + "1050\n", + "1100\n", + "1150\n", + "1200\n", + "1250\n", + "001294\n", + "007953\n", + "\n", + "\n", + "\n", + "1300\n", + "1350\n", + "001351\n", + "020486\n", + "\n", + "\n", + "\n", + "001353\n", + "008074\n", + "020393\n", + "\n", + "\n", + "\n", + "001375\n", + "001376\n", + "\n", + "\n", + "\n", + "001384\n", + "008206\n", + "\n", + "\n", + "\n", + "1400\n" + ] + } + ], + "source": [ + "import os,re,difflib,Levenshtein,time,json\n", + "\n", + "# 相同题目的阈值\n", + "threshold = 0.99\n", + "\n", + "outputfile = r\"../临时文件/相同题目列表.txt\"\n", + "\n", + "#生成数码列表, 逗号分隔每个区块, 区块内部用:表示整数闭区间\n", + "def generate_number_set(string):\n", + " string = re.sub(r\"[\\n\\s]\",\"\",string)\n", + " string_list = string.split(\",\")\n", + " numbers_list = []\n", + " for s in string_list:\n", + " if not \":\" in s:\n", + " numbers_list.append(s.zfill(6))\n", + " else:\n", + " start,end = s.split(\":\")\n", + " for ind in range(int(start),int(end)+1):\n", + " numbers_list.append(str(ind).zfill(6))\n", + " return numbers_list\n", + "\n", + "#字符串预处理\n", + "def pre_treating(string):\n", + " string = re.sub(r\"\\\\begin\\{center\\}[\\s\\S]*?\\\\end\\{center\\}\",\"\",string)\n", + " string = re.sub(r\"(bracket\\{\\d+\\})|(blank\\{\\d+\\})|(fourch)|(twoch)|(onech)\",\"\",string)\n", + " string = re.sub(r\"[\\s\\\\\\{\\}\\$\\(\\)\\[\\]]\",\"\",string)\n", + " string = re.sub(r\"[\\n\\t]\",\"\",string)\n", + " string = re.sub(r\"(displaystyle)|(overrightarrow)\",\"\",string)\n", + " string = re.sub(r\"[,\\.:;?]\",\"\",string)\n", + " return string\n", + "\n", + "#difflab字符串比较\n", + "def difflab_get_equal_rate(str1, str2):\n", + " # str1 = pre_treating(str1)\n", + " # str2 = pre_treating(str2)\n", + " return difflib.SequenceMatcher(None, str1, str2).ratio()\n", + "\n", + "#Levenshtein jaro字符串比较\n", + "def jaro_get_equal_rate(str1,str2):\n", + " # str1 = pre_treating(str1)\n", + " # str2 = pre_treating(str2)\n", + " return Levenshtein.jaro(str1,str2)\n", + "\n", + "#Levenshtein 字符串比较\n", + "def Lev_get_equal_rate(str1,str2):\n", + " # str1 = pre_treating(str1)\n", + " # str2 = pre_treating(str2)\n", + " return Levenshtein.ratio(str1,str2)\n", + "\n", + "\n", + "\n", + "\n", + "#指定对比方法\n", + "sim_test = jaro_get_equal_rate\n", + "\n", + "#读入题库\n", + "with open(r\"../题库0.3/Problems.json\",\"r\",encoding = \"utf8\") as f:\n", + " database = f.read()\n", + "pro_dict = json.loads(database)\n", + "\n", + "pro_dict_treated = {}\n", + "for id in pro_dict:\n", + " pro_dict_treated[id] = pro_dict[id].copy()\n", + " pro_dict_treated[id][\"content\"] = pre_treating(pro_dict_treated[id][\"content\"])\n", + "\n", + "\n", + "print(\"题目数:\",len(pro_dict))\n", + "\n", + "#记录起始时间\n", + "starttime = time.time()\n", + "alike_problems = \"\"\n", + "\n", + "\n", + "count = 0\n", + "keys = list(pro_dict_treated.keys())\n", + "while len(keys) >= 2:\n", + " count += 1\n", + " if count % 50 == 0:\n", + " print(count)\n", + " \n", + " currentid = keys.pop(0)\n", + " content1 = pro_dict_treated[currentid][\"content\"]\n", + " same = []\n", + " for id in keys:\n", + " content2 = pro_dict_treated[id][\"content\"]\n", + " if sim_test(content1,content2)>threshold:\n", + " same.append(id)\n", + " if len(same) >= 1:\n", + " print(currentid)\n", + " alike_problems += currentid + \",\"\n", + " for i in same:\n", + " print(i)\n", + " keys.pop(keys.index(i))\n", + " alike_problems += \",\".join(same)\n", + " alike_problems += \"\\n\"\n", + " print(\"\\n\\n\")\n", + "\n", + "endtime = time.time()\n", + "print(\"耗时: %.3f秒\" %(endtime-starttime))\n", + "\n", + "with open(outputfile,\"w\",encoding = \"u8\") as f:\n", + " f.write(alike_problems)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mathdept", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}