{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Similar-problem detection for the problem bank\n",
    "\n",
    "Compares every *new* problem (the IDs selected by `id_new_problems`) with every older problem and with every other new problem.\n",
    "\n",
    "- A pair is *suspect* when its normalised contents score strictly above `threshold`, or when the two problems already reference each other through their `related` / `same` fields.\n",
    "- Suspect pairs that are not yet cross-referenced are written to `临时文件/相似题目.txt` for manual review.\n",
    "\n",
    "**Last recorded run** (the stored cell output was cleared): 0 old problems, 18100 new problems; 2800 suspect pairs, 2223 of them already annotated; about 385 s in total."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import difflib\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "import time\n",
    "\n",
    "import Levenshtein\n",
    "\n",
    "# IMPORTANT: the range of problem IDs treated as new problems.\n",
    "# Comma-separated segments; `a:b` is the closed integer range a..b.\n",
    "id_new_problems = \"1:50000\"\n",
    "# A pair is reported when its similarity score is strictly greater than this.\n",
    "threshold = 0.99\n",
    "\n",
    "# Problem bank to read and report file to write.\n",
    "database_path = r\"../题库0.3/Problems.json\"\n",
    "output_path = \"临时文件/相似题目.txt\"\n",
    "# Print a progress counter after every `progress_step` new problems.\n",
    "progress_step = 50"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_number_set(string):\n",
    "    \"\"\"Expand an ID specification into a list of zero-padded, 6-character ID strings.\n",
    "\n",
    "    Segments are separated by commas. A segment is either a single ID or\n",
    "    `start:end`, which denotes the closed integer range start..end. All\n",
    "    whitespace is ignored, and empty segments (an empty specification or a\n",
    "    trailing comma) are skipped instead of producing the ID `000000`.\n",
    "\n",
    "    Raises ValueError if a range bound is not an integer or a segment holds\n",
    "    more than one colon.\n",
    "    \"\"\"\n",
    "    numbers_list = []\n",
    "    for segment in re.sub(r\"\\s\", \"\", string).split(\",\"):\n",
    "        if not segment:\n",
    "            continue\n",
    "        if \":\" in segment:\n",
    "            start, end = segment.split(\":\")\n",
    "            numbers_list.extend(str(number).zfill(6) for number in range(int(start), int(end) + 1))\n",
    "        else:\n",
    "            numbers_list.append(segment.zfill(6))\n",
    "    return numbers_list\n",
    "\n",
    "\n",
    "# Patterns deleted by pre_treating(), applied in this order.\n",
    "_NOISE_PATTERNS = (\n",
    "    # Complete center environments, including their bodies.\n",
    "    re.compile(r\"\\\\begin\\{center\\}[\\s\\S]*?\\\\end\\{center\\}\"),\n",
    "    # The macro names bracket{n}, blank{n}, fourch, twoch and onech.\n",
    "    re.compile(r\"(bracket\\{\\d+\\})|(blank\\{\\d+\\})|(fourch)|(twoch)|(onech)\"),\n",
    "    # All whitespace, backslashes, braces, dollar signs, parentheses and square brackets.\n",
    "    re.compile(r\"[\\s\\\\\\{\\}\\$\\(\\)\\[\\]]\"),\n",
    "    # Command names that remain once their backslash has been removed.\n",
    "    re.compile(r\"(displaystyle)|(overrightarrow)\"),\n",
    "    # ASCII punctuation.\n",
    "    re.compile(r\"[,\\.:;?]\"),\n",
    ")\n",
    "\n",
    "\n",
    "def pre_treating(string):\n",
    "    \"\"\"Normalise a problem's source text before it is compared.\n",
    "\n",
    "    Deletes every match of each pattern in `_NOISE_PATTERNS`, in order, and\n",
    "    returns the remaining text.\n",
    "    \"\"\"\n",
    "    for pattern in _NOISE_PATTERNS:\n",
    "        string = pattern.sub(\"\", string)\n",
    "    return string\n",
    "\n",
    "\n",
    "def difflab_get_equal_rate(str1, str2):\n",
    "    \"\"\"Return the difflib.SequenceMatcher ratio of the two strings.\"\"\"\n",
    "    return difflib.SequenceMatcher(None, str1, str2).ratio()\n",
    "\n",
    "\n",
    "def jaro_get_equal_rate(str1, str2):\n",
    "    \"\"\"Return Levenshtein.jaro of the two strings.\"\"\"\n",
    "    return Levenshtein.jaro(str1, str2)\n",
    "\n",
    "\n",
    "def Lev_get_equal_rate(str1, str2):\n",
    "    \"\"\"Return Levenshtein.ratio of the two strings.\"\"\"\n",
    "    return Levenshtein.ratio(str1, str2)\n",
    "\n",
    "\n",
    "# Similarity measure used by the comparison below.\n",
    "sim_test = jaro_get_equal_rate\n",
    "\n",
    "\n",
    "def is_cross_referenced(id_a, problem_a, id_b, problem_b):\n",
    "    \"\"\"Return True if either problem lists the other's ID in its `related` or `same` field.\n",
    "\n",
    "    Presumably both fields are lists of ID strings; only `in` is used on them - TODO confirm.\n",
    "    \"\"\"\n",
    "    return (\n",
    "        id_b in problem_a[\"related\"]\n",
    "        or id_b in problem_a[\"same\"]\n",
    "        or id_a in problem_b[\"related\"]\n",
    "        or id_a in problem_b[\"same\"]\n",
    "    )\n",
    "\n",
    "\n",
    "def examine_pair(id_a, problem_a, content_a, id_b, problem_b, content_b):\n",
    "    \"\"\"Classify one pair of problems.\n",
    "\n",
    "    `content_a` / `content_b` are the normalised texts handed to `sim_test`;\n",
    "    `problem_a` / `problem_b` are the full records.\n",
    "\n",
    "    Returns `(is_suspect, is_marked, report_entry)`:\n",
    "    - already cross-referenced pair: `(True, True, \"\")`; the similarity is not computed;\n",
    "    - similarity strictly above `threshold`: `(True, False, entry)` where `entry` is the\n",
    "      report text (score, then both IDs with their original contents);\n",
    "    - anything else: `(False, False, \"\")`.\n",
    "    \"\"\"\n",
    "    if is_cross_referenced(id_a, problem_a, id_b, problem_b):\n",
    "        return True, True, \"\"\n",
    "    similar_rate = sim_test(content_a, content_b)\n",
    "    if similar_rate > threshold:\n",
    "        entry = (\n",
    "            (\"%.4f\" % similar_rate) + \"\\n\\n\"\n",
    "            + id_a + \" \" + problem_a[\"content\"] + \"\\n\\n\"\n",
    "            + id_b + \" \" + problem_b[\"content\"] + \"\\n\\n\"\n",
    "        )\n",
    "        return True, False, entry\n",
    "    return False, False, \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the problem bank; the code below treats it as a dict keyed by problem ID.\n",
    "with open(database_path, \"r\", encoding=\"utf8\") as f:\n",
    "    pro_dict = json.load(f)\n",
    "\n",
    "# Split the bank into new and old problems and cache the normalised contents.\n",
    "new_id_list_raw = generate_number_set(id_new_problems)\n",
    "new_id_set = set(new_id_list_raw)  # set lookups instead of scanning a list per problem\n",
    "new_id_list = [problem_id for problem_id in pro_dict if problem_id in new_id_set]\n",
    "old_problems_dict = {}\n",
    "new_problems_dict = {}\n",
    "old_problems_dict_content = {}\n",
    "new_problems_dict_content = {}\n",
    "for problem_id, problem in pro_dict.items():\n",
    "    normalised = pre_treating(problem[\"content\"])\n",
    "    if problem_id in new_id_set:\n",
    "        new_problems_dict[problem_id] = problem\n",
    "        new_problems_dict_content[problem_id] = normalised\n",
    "    else:\n",
    "        old_problems_dict[problem_id] = problem\n",
    "        old_problems_dict_content[problem_id] = normalised\n",
    "print(\"旧题目数:\", len(old_problems_dict), \", 新题目数:\", len(new_problems_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Record the start time and reset the counters so the cell can be re-run.\n",
    "start_time = time.time()\n",
    "suspect_count = 0\n",
    "remarked = 0\n",
    "alike_entries = []\n",
    "\n",
    "# Compare every new problem with every old problem.\n",
    "print(\"开始新题与旧题的比对\")\n",
    "for count, (id_new, problem_new) in enumerate(new_problems_dict.items(), start=1):\n",
    "    if count % progress_step == 0:\n",
    "        print(count)\n",
    "    content_new = new_problems_dict_content[id_new]\n",
    "    for id_old, problem_old in old_problems_dict.items():\n",
    "        is_suspect, is_marked, entry = examine_pair(\n",
    "            id_new, problem_new, content_new,\n",
    "            id_old, problem_old, old_problems_dict_content[id_old],\n",
    "        )\n",
    "        if is_suspect:\n",
    "            suspect_count += 1\n",
    "        if is_marked:\n",
    "            remarked += 1\n",
    "        if entry:\n",
    "            alike_entries.append(entry)\n",
    "\n",
    "# Compare the new problems with each other: each problem is paired once with\n",
    "# every problem that follows it, and new_problems_dict is left untouched.\n",
    "print(\"开始新题之间的比对\")\n",
    "new_ids = list(new_problems_dict)\n",
    "for count, id_current in enumerate(new_ids[:-1], start=1):\n",
    "    if count % progress_step == 0:\n",
    "        print(count)\n",
    "    problem_current = new_problems_dict[id_current]\n",
    "    content_current = new_problems_dict_content[id_current]\n",
    "    # `count` is the 1-based position of the current problem, i.e. the index of its successor.\n",
    "    for index in range(count, len(new_ids)):\n",
    "        id_new = new_ids[index]\n",
    "        is_suspect, is_marked, entry = examine_pair(\n",
    "            id_new, new_problems_dict[id_new], new_problems_dict_content[id_new],\n",
    "            id_current, problem_current, content_current,\n",
    "        )\n",
    "        if is_suspect:\n",
    "            suspect_count += 1\n",
    "        if is_marked:\n",
    "            remarked += 1\n",
    "        if entry:\n",
    "            alike_entries.append(entry)\n",
    "\n",
    "# Record the end time and show the result.\n",
    "end_time = time.time()\n",
    "print(\"总耗时:\", end_time - start_time, \"秒.\")\n",
    "print(\"发现相似: \", suspect_count, \", 其中已标注: \", remarked, \".\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the similar but not yet cross-referenced pairs to the report file.\n",
    "alike_problems = \"\".join(alike_entries)\n",
    "output_dir = os.path.dirname(output_path)\n",
    "if output_dir:\n",
    "    # Create the folder first so the long comparison is not lost to a missing directory.\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "with open(output_path, \"w\", encoding=\"utf8\") as f:\n",
    "    f.write(alike_problems)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mathdept",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "ff3c292c316ba85de6f1ad75f19c731e79d694e741b6f515ec18f14996fe48dc"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}