import difflib
import json
import os
import re
import time

import Levenshtein

# Two problems are reported as "the same" when their similarity is strictly
# greater than this value.
threshold = 0.99

# Report file listing the groups of look-alike problems.
outputfile = r"临时文件/相同题目列表.txt"

# Patterns used by pre_treating(), compiled once because the function is
# called for every problem in the database.
_CENTER_ENV_RE = re.compile(r"\\begin\{center\}[\s\S]*?\\end\{center\}")
_LAYOUT_TOKEN_RE = re.compile(r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)")
_MARKUP_CHAR_RE = re.compile(r"[\s\\\{\}\$\(\)\[\]]")
_STYLE_WORD_RE = re.compile(r"(displaystyle)|(overrightarrow)")
_PUNCTUATION_RE = re.compile(r"[,\.:;?]")


def generate_number_set(string):
    """Expand a compact ID specification into a list of 6-digit ID strings.

    Chunks are separated by commas. A chunk is either a single ID or a
    ``start:end`` pair denoting the closed integer interval [start, end].
    All whitespace in the specification is ignored.

    Empty chunks (empty input, trailing or doubled commas) are skipped so
    they do not turn into a spurious ``"000000"`` entry.

    Args:
        string: The specification, e.g. ``"3, 10:12"``.

    Returns:
        IDs left-padded with zeros to six characters, in specification
        order, e.g. ``["000003", "000010", "000011", "000012"]``.

    Raises:
        ValueError: If a range chunk has non-integer bounds or more than
            one ``:``.
    """
    string = re.sub(r"\s", "", string)
    numbers_list = []
    for chunk in string.split(","):
        if not chunk:
            continue
        if ":" not in chunk:
            numbers_list.append(chunk.zfill(6))
        else:
            start, end = chunk.split(":")
            numbers_list.extend(
                str(number).zfill(6) for number in range(int(start), int(end) + 1)
            )
    return numbers_list


def pre_treating(string):
    """Normalise a problem's source text before similarity comparison.

    The passes run in a fixed order, because later passes rely on what the
    earlier ones have (or have not yet) removed:

    1. drop every ``\\begin{center} ... \\end{center}`` environment,
    2. drop ``bracket{N}`` / ``blank{N}`` and the tokens ``fourch``,
       ``twoch``, ``onech`` (must happen while the braces still exist),
    3. drop all whitespace, backslashes, braces, ``$``, parentheses and
       square brackets (this already covers newlines and tabs, so no
       separate pass is needed for them),
    4. drop the words ``displaystyle`` and ``overrightarrow``,
    5. drop the ASCII punctuation ``, . : ; ?``.

    Args:
        string: Raw problem content.

    Returns:
        The stripped-down text used as comparison key.
    """
    string = _CENTER_ENV_RE.sub("", string)
    string = _LAYOUT_TOKEN_RE.sub("", string)
    string = _MARKUP_CHAR_RE.sub("", string)
    string = _STYLE_WORD_RE.sub("", string)
    string = _PUNCTUATION_RE.sub("", string)
    return string


def difflab_get_equal_rate(str1, str2):
    """Return the difflib ``SequenceMatcher`` ratio of two strings.

    The inputs are compared as given; callers are expected to pass text
    that has already been normalised with ``pre_treating``.
    """
    return difflib.SequenceMatcher(None, str1, str2).ratio()


def jaro_get_equal_rate(str1, str2):
    """Return the Jaro similarity of two strings (``Levenshtein.jaro``).

    The inputs are compared as given; callers are expected to pass text
    that has already been normalised with ``pre_treating``.
    """
    return Levenshtein.jaro(str1, str2)


def Lev_get_equal_rate(str1, str2):
    """Return the Levenshtein similarity ratio (``Levenshtein.ratio``).

    The inputs are compared as given; callers are expected to pass text
    that has already been normalised with ``pre_treating``.
    """
    return Levenshtein.ratio(str1, str2)


def find_alike_problems(problems, treated, similarity, cutoff, report_every=500):
    """Group problems whose normalised contents are nearly identical.

    IDs are visited in the iteration order of ``treated``. Each visited ID
    (the anchor) is compared with every ID still pending after it. A
    candidate is skipped when it is already listed under the anchor's
    ``"same"`` or ``"related"`` entry in ``problems``. Candidates whose
    similarity to the anchor is strictly greater than ``cutoff`` join the
    anchor's group and are removed from the pending list, so they are
    neither used as anchors nor compared again.

    Args:
        problems: Mapping ID -> original record; the ``"same"`` and
            ``"related"`` entries are read with ``in``.
        treated: Mapping ID -> record whose ``"content"`` is the
            normalised text.
        similarity: Callable ``(str, str) -> float``.
        cutoff: Similarity threshold (exclusive).
        report_every: Print the number of anchors processed each time it
            reaches a multiple of this value.

    Returns:
        A list of groups; each group is ``[anchor_id, match_id, ...]`` and
        contains at least two IDs.
    """
    pending = list(treated)
    groups = []
    processed = 0
    while len(pending) >= 2:
        processed += 1
        if processed % report_every == 0:
            print(processed)

        anchor_id = pending.pop(0)
        anchor_text = treated[anchor_id]["content"]
        same_ids = problems[anchor_id]["same"]
        related_ids = problems[anchor_id]["related"]

        matches = [
            other_id
            for other_id in pending
            if other_id not in same_ids
            and other_id not in related_ids
            and similarity(anchor_text, treated[other_id]["content"]) > cutoff
        ]
        if matches:
            groups.append([anchor_id, *matches])
            # Remove all matches in one pass instead of one list scan each.
            matched = set(matches)
            pending = [problem_id for problem_id in pending if problem_id not in matched]
    return groups


# Similarity measure used for this run.
sim_test = jaro_get_equal_rate

# Load the problem database.
with open(r"../题库0.3/Problems.json", "r", encoding="utf8") as f:
    pro_dict = json.load(f)

# Shallow copy of every record with its content replaced by the normalised
# text; the original records in pro_dict are left untouched.
pro_dict_treated = {
    problem_id: {**problem, "content": pre_treating(problem["content"])}
    for problem_id, problem in pro_dict.items()
}

print("题目数:", len(pro_dict))

# Time only the pairwise comparison, not loading/normalisation.
starttime = time.time()
alike_groups = find_alike_problems(pro_dict, pro_dict_treated, sim_test, threshold)
endtime = time.time()
print("耗时: %.3f秒" % (endtime - starttime))

# One comma-separated group per paragraph, each followed by a blank line.
alike_problems = "".join(",".join(group) + "\n\n" for group in alike_groups)

# Make sure the target directory exists so a long run is not lost to a
# FileNotFoundError at the very end.
output_dir = os.path.dirname(outputfile)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(outputfile, "w", encoding="u8") as f:
    f.write(alike_problems)
"file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.15" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }