{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "旧题目数: 1907 , 新题目数: 500\n",
      "开始新题与旧题的比对\n",
      "50\n",
      "100\n",
      "150\n",
      "200\n",
      "250\n",
      "300\n",
      "350\n",
      "400\n",
      "450\n",
      "500\n",
      "开始新题之间的比对\n",
      "50\n",
      "100\n",
      "150\n",
      "200\n",
      "250\n",
      "300\n",
      "350\n",
      "400\n",
      "450\n",
      "总耗时: 3.233793258666992 秒.\n",
      "发现相似:  30 , 其中已标注:  28 .\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import re\n",
    "import difflib\n",
    "import time\n",
    "import json\n",
    "\n",
    "import Levenshtein\n",
    "\n",
    "# IMPORTANT: id ranges of the new and the old problems (an id in both ranges counts as new)\n",
    "id_new_problems = \"1:500\"\n",
    "id_old_problems = \"30000:50000\"\n",
    "# A pair is reported when its similarity score is strictly greater than this value\n",
    "threshold = 0.95\n",
    "\n",
    "\n",
    "def generate_number_set(string):\n",
    "    \"\"\"Expand an id specification into a list of 6-digit, zero-padded id strings.\n",
    "\n",
    "    The specification is a comma-separated list of blocks; each block is either\n",
    "    a single id (\"12\") or an inclusive integer range written \"start:end\"\n",
    "    (\"1:500\"). All whitespace is ignored. Empty blocks (empty input, doubled or\n",
    "    trailing commas) are skipped rather than turned into a bogus \"000000\" id.\n",
    "    \"\"\"\n",
    "    string = re.sub(r\"\\s\", \"\", string)\n",
    "    numbers_list = []\n",
    "    for block in string.split(\",\"):\n",
    "        if not block:\n",
    "            continue\n",
    "        if \":\" in block:\n",
    "            start, end = block.split(\":\")\n",
    "            numbers_list.extend(str(ind).zfill(6) for ind in range(int(start), int(end) + 1))\n",
    "        else:\n",
    "            numbers_list.append(block.zfill(6))\n",
    "    return numbers_list\n",
    "\n",
    "\n",
    "def pre_treating(string):\n",
    "    \"\"\"Strip layout and markup noise from a problem text before it is compared.\n",
    "\n",
    "    The substitutions run in the order written, each on the result of the\n",
    "    previous one, and every match is replaced by the empty string.\n",
    "    \"\"\"\n",
    "    # Drop whole center environments (begin{center} ... end{center}), matched non-greedily\n",
    "    string = re.sub(r\"\\\\begin\\{center\\}[\\s\\S]*?\\\\end\\{center\\}\",\"\",string)\n",
    "    # Drop bracket{n} / blank{n} / fourch / twoch / onech markers (before the braces are removed)\n",
    "    string = re.sub(r\"(bracket\\{\\d+\\})|(blank\\{\\d+\\})|(fourch)|(twoch)|(onech)\",\"\",string)\n",
    "    # Drop whitespace (newlines and tabs included), backslashes, braces, $, parentheses and square brackets\n",
    "    string = re.sub(r\"[\\s\\\\\\{\\}\\$\\(\\)\\[\\]]\",\"\",string)\n",
    "    # Backslashes are already gone at this point, so the command names are matched bare\n",
    "    string = re.sub(r\"(displaystyle)|(overrightarrow)\",\"\",string)\n",
    "    # Drop ASCII punctuation only; any other punctuation is left in place\n",
    "    string = re.sub(r\"[,\\.:;?]\",\"\",string)\n",
    "    return string\n",
    "\n",
    "\n",
    "def difflab_get_equal_rate(str1, str2):\n",
    "    \"\"\"Return the difflib.SequenceMatcher ratio of the two strings, compared as given.\"\"\"\n",
    "    return difflib.SequenceMatcher(None, str1, str2).ratio()\n",
    "\n",
    "\n",
    "def jaro_get_equal_rate(str1,str2):\n",
    "    \"\"\"Return Levenshtein.jaro(str1, str2); the strings are compared as given.\"\"\"\n",
    "    return Levenshtein.jaro(str1,str2)\n",
    "\n",
    "\n",
    "def Lev_get_equal_rate(str1,str2):\n",
    "    \"\"\"Return Levenshtein.ratio(str1, str2); the strings are compared as given.\"\"\"\n",
    "    return Levenshtein.ratio(str1,str2)\n",
    "\n",
    "\n",
    "# Similarity function used for every comparison\n",
    "sim_test = jaro_get_equal_rate\n",
    "\n",
    "# Load the problem bank\n",
    "with open(r\"../题库0.3/Problems.json\",\"r\",encoding = \"utf8\") as f:\n",
    "    pro_dict = json.load(f)\n",
    "\n",
    "# Split the bank into new and old problems; an id that falls in both ranges counts as new.\n",
    "# Sets keep the membership tests constant-time.\n",
    "new_id_set = set(generate_number_set(id_new_problems))\n",
    "old_id_set = set(generate_number_set(id_old_problems))\n",
    "new_id_list = [pid for pid in pro_dict if pid in new_id_set]\n",
    "old_id_list = [pid for pid in pro_dict if pid in old_id_set and pid not in new_id_set]\n",
    "new_problems_dict = {pid: pro_dict[pid] for pid in new_id_list}\n",
    "old_problems_dict = {pid: pro_dict[pid] for pid in old_id_list}\n",
    "# Pre-processed problem texts, keyed like the two dictionaries above\n",
    "new_problems_dict_content = {pid: pre_treating(pro_dict[pid][\"content\"]) for pid in new_id_list}\n",
    "old_problems_dict_content = {pid: pre_treating(pro_dict[pid][\"content\"]) for pid in old_id_list}\n",
    "print(\"旧题目数:\",len(old_problems_dict),\", 新题目数:\",len(new_problems_dict))\n",
    "\n",
    "\n",
    "def iter_pairs():\n",
    "    \"\"\"Yield (id_new, id_other, other_problem, other_content) for every pair to compare.\n",
    "\n",
    "    First every new problem is paired with every old problem, then every new\n",
    "    problem is paired with each new problem that precedes it. A progress\n",
    "    counter is printed every 50 outer iterations. Nothing is removed from the\n",
    "    module-level dictionaries.\n",
    "    \"\"\"\n",
    "    print(\"开始新题与旧题的比对\")\n",
    "    for count, id_new in enumerate(new_problems_dict, start=1):\n",
    "        if count % 50 == 0:\n",
    "            print(count)\n",
    "        for id_old in old_problems_dict:\n",
    "            yield id_new, id_old, old_problems_dict[id_old], old_problems_dict_content[id_old]\n",
    "\n",
    "    print(\"开始新题之间的比对\")\n",
    "    new_ids = list(new_problems_dict)\n",
    "    # The last id has no later partner, so it never plays the \"current\" role\n",
    "    for count, id_current in enumerate(new_ids[:-1], start=1):\n",
    "        if count % 50 == 0:\n",
    "            print(count)\n",
    "        # new_ids[count:] are exactly the ids that come after id_current\n",
    "        for id_new in new_ids[count:]:\n",
    "            yield id_new, id_current, new_problems_dict[id_current], new_problems_dict_content[id_current]\n",
    "\n",
    "\n",
    "def find_similar_problems():\n",
    "    \"\"\"Compare the new problems with the old ones and with each other.\n",
    "\n",
    "    A pair is a suspect when its similarity is above `threshold` or when either\n",
    "    problem already lists the other under its \"related\" or \"same\" field. Only\n",
    "    suspects that are not yet marked that way are added to the report.\n",
    "\n",
    "    `sim_test` and `threshold` are read when the function is called, and the\n",
    "    problem dictionaries are not modified, so the function can be called again\n",
    "    after changing either setting.\n",
    "\n",
    "    Returns (report_text, suspect_count, remarked).\n",
    "    \"\"\"\n",
    "    suspect_count = 0\n",
    "    remarked = 0\n",
    "    report = []\n",
    "    for id_new, id_other, other_problem, other_content in iter_pairs():\n",
    "        new_problem = new_problems_dict[id_new]\n",
    "        marked = (\n",
    "            id_new in other_problem[\"related\"] or id_new in other_problem[\"same\"]\n",
    "            or id_other in new_problem[\"related\"] or id_other in new_problem[\"same\"]\n",
    "        )\n",
    "        similar_rate = sim_test(new_problems_dict_content[id_new], other_content)\n",
    "        if not (marked or similar_rate > threshold):\n",
    "            continue\n",
    "        suspect_count += 1\n",
    "        if marked:\n",
    "            remarked += 1\n",
    "        else:\n",
    "            report.append((\"%.4f\" % similar_rate) + \"\\n\\n\" + id_new + \" \" + new_problem[\"content\"] + \"\\n\\n\" + id_other + \" \" + other_problem[\"content\"] + \"\\n\\n\")\n",
    "    return \"\".join(report), suspect_count, remarked\n",
    "\n",
    "\n",
    "def run_comparison():\n",
    "    \"\"\"Time find_similar_problems(), print the summary and write the report file.\"\"\"\n",
    "    start_time = time.time()\n",
    "    alike_problems, suspect_count, remarked = find_similar_problems()\n",
    "    end_time = time.time()\n",
    "    print(\"总耗时:\",end_time-start_time,\"秒.\")\n",
    "    print(\"发现相似: \",suspect_count,\", 其中已标注: \",remarked,\".\")\n",
    "\n",
    "    with open(\"临时文件/相似题目.txt\",\"w\",encoding=\"utf8\") as f:\n",
    "        f.write(alike_problems)\n",
    "\n",
    "\n",
    "run_comparison()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-run the comparison without reloading the problem bank,\n",
    "# e.g. after changing `threshold` or `sim_test`.\n",
    "run_comparison()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mathdept",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.15"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "ff3c292c316ba85de6f1ad75f19c731e79d694e741b6f515ec18f14996fe48dc"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}