{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Find duplicate problems in the problem bank\n",
    "\n",
    "Loads the problem bank, strips layout markup, whitespace and punctuation from every problem's `content`, and compares the problems pairwise with the similarity measure selected in `sim_test`. Problems whose similarity to an earlier problem is strictly above `threshold` are grouped with it, and each group is written to `outputfile` as one comma-separated line of problem IDs.\n",
    "\n",
    "**Cost:** every problem is compared with all later problems that are not yet grouped, so the scan is quadratic in the number of problems."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import difflib\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "import time\n",
    "\n",
    "import Levenshtein"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two problems are reported as duplicates when their similarity is strictly greater than this.\n",
    "threshold = 0.99\n",
    "\n",
    "# The running count of scanned problems is printed every this many problems.\n",
    "progress_every = 50\n",
    "\n",
    "# Problem bank to read and report file to write.\n",
    "problems_file = r\"../题库0.3/Problems.json\"\n",
    "outputfile = r\"../临时文件/相同题目列表.txt\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_number_set(string):\n",
    "    \"\"\"Expand an ID specification into a list of 6-character, zero-padded ID strings.\n",
    "\n",
    "    Blocks are separated by commas; a block written as ``start:end`` stands for\n",
    "    the closed integer interval [start, end]. All whitespace is ignored, and\n",
    "    empty blocks (an empty string, a trailing comma) are skipped instead of\n",
    "    being turned into the ID \"000000\".\n",
    "    \"\"\"\n",
    "    compact = re.sub(r\"[\\n\\s]\", \"\", string)\n",
    "    numbers_list = []\n",
    "    for block in compact.split(\",\"):\n",
    "        if not block:\n",
    "            continue\n",
    "        if \":\" in block:\n",
    "            start, end = block.split(\":\")\n",
    "            numbers_list.extend(\n",
    "                str(number).zfill(6) for number in range(int(start), int(end) + 1)\n",
    "            )\n",
    "        else:\n",
    "            numbers_list.append(block.zfill(6))\n",
    "    return numbers_list\n",
    "\n",
    "\n",
    "# Patterns deleted by pre_treating(), in this order. The order matters: the\n",
    "# bracket{n} / blank{n} markers must be removed before the braces themselves are.\n",
    "_PRE_TREATING_PATTERNS = tuple(\n",
    "    re.compile(pattern)\n",
    "    for pattern in (\n",
    "        r\"\\\\begin\\{center\\}[\\s\\S]*?\\\\end\\{center\\}\",  # entire center environments\n",
    "        r\"(bracket\\{\\d+\\})|(blank\\{\\d+\\})|(fourch)|(twoch)|(onech)\",  # bracket{n}, blank{n} and *ch markers\n",
    "        r\"[\\s\\\\\\{\\}\\$\\(\\)\\[\\]]\",  # whitespace, backslashes, braces, dollar signs, parentheses, square brackets\n",
    "        r\"(displaystyle)|(overrightarrow)\",  # these two words, wherever they remain\n",
    "        r\"[,\\.:;?]\",  # ASCII punctuation\n",
    "    )\n",
    ")\n",
    "\n",
    "\n",
    "def pre_treating(string):\n",
    "    \"\"\"Return ``string`` with markup, whitespace and punctuation removed for comparison.\n",
    "\n",
    "    Every match of each pattern in ``_PRE_TREATING_PATTERNS`` is deleted, one\n",
    "    pattern after another.\n",
    "    \"\"\"\n",
    "    for pattern in _PRE_TREATING_PATTERNS:\n",
    "        string = pattern.sub(\"\", string)\n",
    "    return string\n",
    "\n",
    "\n",
    "def difflab_get_equal_rate(str1, str2):\n",
    "    \"\"\"Similarity of two strings as returned by ``difflib.SequenceMatcher(...).ratio()``.\"\"\"\n",
    "    return difflib.SequenceMatcher(None, str1, str2).ratio()\n",
    "\n",
    "\n",
    "def jaro_get_equal_rate(str1, str2):\n",
    "    \"\"\"Similarity of two strings as returned by ``Levenshtein.jaro``.\"\"\"\n",
    "    return Levenshtein.jaro(str1, str2)\n",
    "\n",
    "\n",
    "def Lev_get_equal_rate(str1, str2):\n",
    "    \"\"\"Similarity of two strings as returned by ``Levenshtein.ratio``.\"\"\"\n",
    "    return Levenshtein.ratio(str1, str2)\n",
    "\n",
    "\n",
    "def iter_duplicate_groups(contents, similarity, threshold, progress_every=50):\n",
    "    \"\"\"Yield groups of IDs whose texts match the text of the group's first ID.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    contents : dict\n",
    "        Maps each ID to the text to compare.\n",
    "    similarity : callable\n",
    "        ``similarity(text1, text2)`` returns a number.\n",
    "    threshold : float\n",
    "        Two texts match when ``similarity(text1, text2) > threshold``.\n",
    "    progress_every : int\n",
    "        The running count of scanned IDs is printed at this interval.\n",
    "\n",
    "    Yields\n",
    "    ------\n",
    "    list\n",
    "        ``[first_id, match_1, match_2, ...]`` for every scanned ID that has at\n",
    "        least one match.\n",
    "\n",
    "    IDs are scanned in the order of ``contents``. Each scanned ID is compared\n",
    "    with every later ID that has not been matched yet; matched IDs are removed\n",
    "    from the scan, so an ID appears in at most one group. Members of a group\n",
    "    are only guaranteed to match the group's first ID, not each other.\n",
    "    \"\"\"\n",
    "    remaining = list(contents)\n",
    "    scanned = 0\n",
    "    while len(remaining) >= 2:\n",
    "        scanned += 1\n",
    "        if scanned % progress_every == 0:\n",
    "            print(scanned)\n",
    "\n",
    "        current_id = remaining.pop(0)\n",
    "        current_text = contents[current_id]\n",
    "        matches = [\n",
    "            other_id\n",
    "            for other_id in remaining\n",
    "            if similarity(current_text, contents[other_id]) > threshold\n",
    "        ]\n",
    "        if matches:\n",
    "            print(current_id)\n",
    "            for other_id in matches:\n",
    "                print(other_id)\n",
    "            print(\"\\n\\n\")\n",
    "            # Remove all matches in one pass rather than one list search per match.\n",
    "            matched = set(matches)\n",
    "            remaining = [other_id for other_id in remaining if other_id not in matched]\n",
    "            yield [current_id] + matches"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and normalise the problem bank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(problems_file, \"r\", encoding=\"utf8\") as f:\n",
    "    pro_dict = json.load(f)\n",
    "\n",
    "# Same records as pro_dict, with \"content\" replaced by its normalised form.\n",
    "pro_dict_treated = {\n",
    "    problem_id: {**problem, \"content\": pre_treating(problem[\"content\"])}\n",
    "    for problem_id, problem in pro_dict.items()\n",
    "}\n",
    "\n",
    "print(\"题目数:\",len(pro_dict))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scan for duplicates\n",
    "\n",
    "Groups are appended to `duplicate_groups` as they are found, so if the scan is interrupted the groups found so far are still available to the next cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Similarity measure used for the scan.\n",
    "sim_test = jaro_get_equal_rate\n",
    "\n",
    "contents = {\n",
    "    problem_id: problem[\"content\"] for problem_id, problem in pro_dict_treated.items()\n",
    "}\n",
    "\n",
    "starttime = time.time()\n",
    "duplicate_groups = []\n",
    "for group in iter_duplicate_groups(contents, sim_test, threshold, progress_every):\n",
    "    duplicate_groups.append(group)\n",
    "endtime = time.time()\n",
    "print(\"耗时: %.3f秒\" %(endtime-starttime))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write the report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One line per group: the IDs joined by commas.\n",
    "alike_problems = \"\".join(\",\".join(group) + \"\\n\" for group in duplicate_groups)\n",
    "\n",
    "# Create the report directory first so a missing directory does not discard the scan results.\n",
    "output_dir = os.path.dirname(outputfile)\n",
    "if output_dir:\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "with open(outputfile,\"w\",encoding = \"u8\") as f:\n",
    "    f.write(alike_problems)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mathdept",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}