{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Duplicate-problem check for newly added problems\n",
    "\n",
    "Compares every problem in the new id range against all other problems in the bank, and the new problems against each other. Pairs whose similarity exceeds `threshold` and that are not already cross-referenced through their `related` / `same` fields are written to a text report for manual review."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import difflib\n",
    "import functools\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "import time\n",
    "from hashlib import new  # NOTE(review): looks unused in this notebook; kept rather than dropping an existing import\n",
    "\n",
    "import Levenshtein"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# IMPORTANT: id range of the newly added problems.\n",
    "# Comma-separated blocks; inside a block \"a:b\" means the inclusive integer range a..b.\n",
    "id_new_problems = \"20000:20010\"\n",
    "\n",
    "# A pair is reported when its similarity is strictly greater than this value.\n",
    "threshold = 0.85\n",
    "\n",
    "# Input problem bank and output report.\n",
    "problems_path = r\"../题库0.3/Problems.json\"\n",
    "report_path = \"临时文件/相似题目.txt\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_number_set(string):\n",
    "    \"\"\"Expand an id specification into a list of zero-padded id strings.\n",
    "\n",
    "    Blocks are separated by commas; a block \"a:b\" stands for the inclusive\n",
    "    integer range a..b. Whitespace anywhere in the specification is ignored.\n",
    "\n",
    "    Args:\n",
    "        string: specification such as \"3,20000:20002\".\n",
    "\n",
    "    Returns:\n",
    "        list of str, each left-padded with zeros to at least 6 characters,\n",
    "        e.g. [\"000003\", \"020000\", \"020001\", \"020002\"].\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a range block is not exactly two integers.\n",
    "    \"\"\"\n",
    "    string = re.sub(r\"\\s\", \"\", string)\n",
    "    numbers_list = []\n",
    "    for block in string.split(\",\"):\n",
    "        if not block:\n",
    "            # Empty specification or stray comma: skip instead of emitting \"000000\".\n",
    "            continue\n",
    "        if \":\" not in block:\n",
    "            numbers_list.append(block.zfill(6))\n",
    "        else:\n",
    "            start, end = block.split(\":\")\n",
    "            numbers_list.extend(str(ind).zfill(6) for ind in range(int(start), int(end) + 1))\n",
    "    return numbers_list\n",
    "\n",
    "\n",
    "# Compiled once; pre_treating applies them in exactly this order.\n",
    "_CENTER_ENV_RE = re.compile(r\"\\\\begin\\{center\\}[\\s\\S]*?\\\\end\\{center\\}\")\n",
    "_COMMAND_NAME_RE = re.compile(r\"(displaystyle)|(overrightarrow)\")\n",
    "_NOISE_CHAR_RE = re.compile(r\"[\\s\\\\\\{\\}\\$\\(\\)\\[\\]]\")\n",
    "\n",
    "\n",
    "@functools.lru_cache(maxsize=None)\n",
    "def pre_treating(string):\n",
    "    \"\"\"Normalize a problem statement before similarity comparison.\n",
    "\n",
    "    Steps, in this order:\n",
    "      1. drop whole LaTeX center environments (begin/end pair included);\n",
    "      2. drop the command names \"displaystyle\" and \"overrightarrow\";\n",
    "      3. drop whitespace, backslashes, \"$\" and the bracket characters {}()[].\n",
    "\n",
    "    Step 1 has to run before step 3 because it matches the backslashes and\n",
    "    braces that step 3 removes. Newlines and tabs are whitespace, so step 3\n",
    "    already removes them and no separate pass is needed (a separate pass\n",
    "    written as a character class would delete the letters n and t instead).\n",
    "\n",
    "    Results are memoized: every text is compared against many others, so each\n",
    "    one is normalized only once.\n",
    "\n",
    "    Args:\n",
    "        string: raw problem text.\n",
    "\n",
    "    Returns:\n",
    "        str: the normalized text.\n",
    "    \"\"\"\n",
    "    string = _CENTER_ENV_RE.sub(\"\", string)\n",
    "    string = _COMMAND_NAME_RE.sub(\"\", string)\n",
    "    return _NOISE_CHAR_RE.sub(\"\", string)\n",
    "\n",
    "\n",
    "def difflab_get_equal_rate(str1, str2):\n",
    "    \"\"\"Similarity of the normalized strings via difflib.SequenceMatcher.ratio().\"\"\"\n",
    "    return difflib.SequenceMatcher(None, pre_treating(str1), pre_treating(str2)).ratio()\n",
    "\n",
    "\n",
    "def jaro_get_equal_rate(str1, str2):\n",
    "    \"\"\"Similarity of the normalized strings via Levenshtein.jaro().\"\"\"\n",
    "    return Levenshtein.jaro(pre_treating(str1), pre_treating(str2))\n",
    "\n",
    "\n",
    "def Lev_get_equal_rate(str1, str2):\n",
    "    \"\"\"Similarity of the normalized strings via Levenshtein.ratio().\"\"\"\n",
    "    return Levenshtein.ratio(pre_treating(str1), pre_treating(str2))\n",
    "\n",
    "\n",
    "def _already_linked(id_a, problem_a, id_b, problem_b):\n",
    "    \"\"\"Return True if either problem lists the other's id under \"related\" or \"same\".\"\"\"\n",
    "    return (\n",
    "        id_a in problem_b[\"related\"]\n",
    "        or id_a in problem_b[\"same\"]\n",
    "        or id_b in problem_a[\"related\"]\n",
    "        or id_b in problem_a[\"same\"]\n",
    "    )\n",
    "\n",
    "\n",
    "def _format_pair(rate, id_a, problem_a, id_b, problem_b):\n",
    "    \"\"\"Build one report entry: the score, then each id followed by its content.\"\"\"\n",
    "    return \"%.4f\\n\\n%s %s\\n\\n%s %s\\n\\n\" % (\n",
    "        rate, id_a, problem_a[\"content\"], id_b, problem_b[\"content\"]\n",
    "    )\n",
    "\n",
    "\n",
    "def record_pair(id_a, problem_a, id_b, problem_b, report_entries):\n",
    "    \"\"\"Examine one pair of problems and record it if it needs review.\n",
    "\n",
    "    A pair that is already cross-referenced counts as a suspect but is not\n",
    "    reported. Otherwise the pair is a suspect, and an entry is appended to\n",
    "    report_entries, when sim_test scores it strictly above threshold.\n",
    "\n",
    "    Args:\n",
    "        id_a, id_b: ids of the two problems.\n",
    "        problem_a, problem_b: problem records; the keys \"content\", \"related\"\n",
    "            and \"same\" are read.\n",
    "        report_entries: list that receives the report text for new findings.\n",
    "\n",
    "    Returns:\n",
    "        (is_suspect, is_linked) as booleans.\n",
    "    \"\"\"\n",
    "    if _already_linked(id_a, problem_a, id_b, problem_b):\n",
    "        return True, True\n",
    "    similar_rate = sim_test(problem_a[\"content\"], problem_b[\"content\"])\n",
    "    if similar_rate > threshold:\n",
    "        report_entries.append(_format_pair(similar_rate, id_a, problem_a, id_b, problem_b))\n",
    "        return True, False\n",
    "    return False, False\n",
    "\n",
    "\n",
    "# Similarity measure used by record_pair.\n",
    "sim_test = jaro_get_equal_rate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the problem bank and split it into old / new problems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(problems_path, \"r\", encoding=\"utf8\") as f:\n",
    "    pro_dict = json.load(f)\n",
    "\n",
    "new_id_list = generate_number_set(id_new_problems)\n",
    "new_id_set = set(new_id_list)  # set membership keeps the split linear in the bank size\n",
    "\n",
    "old_problems_dict = {}\n",
    "new_problems_dict = {}\n",
    "for problem_id, problem in pro_dict.items():\n",
    "    if problem_id in new_id_set:\n",
    "        new_problems_dict[problem_id] = problem\n",
    "    else:\n",
    "        old_problems_dict[problem_id] = problem\n",
    "print(\"旧题目数:\",len(old_problems_dict),\", 新题目数:\",len(new_problems_dict))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_time = time.perf_counter()\n",
    "suspect_count = 0\n",
    "remarked = 0\n",
    "alike_entries = []\n",
    "\n",
    "# New problems against old problems.\n",
    "count = 0\n",
    "print(\"开始新题与旧题的比对\")\n",
    "for id_new, new_problem in new_problems_dict.items():\n",
    "    count += 1\n",
    "    if count % 50 == 0:\n",
    "        print(count)\n",
    "    for id_old, old_problem in old_problems_dict.items():\n",
    "        suspect, linked = record_pair(id_new, new_problem, id_old, old_problem, alike_entries)\n",
    "        suspect_count += suspect\n",
    "        remarked += linked\n",
    "\n",
    "# New problems against each other: every unordered pair is visited once.\n",
    "# Iterating over a snapshot list leaves new_problems_dict intact.\n",
    "print(\"开始新题之间的比对\")\n",
    "new_items = list(new_problems_dict.items())\n",
    "for count, (_, current_problem) in enumerate(new_items[:-1], start=1):\n",
    "    if count % 50 == 0:\n",
    "        print(count)\n",
    "    # The current problem is identified by its own \"id\" field here;\n",
    "    # presumably that equals its key in the bank -- verify against the data.\n",
    "    current_id = current_problem[\"id\"]\n",
    "    for id_new, new_problem in new_items[count:]:\n",
    "        suspect, linked = record_pair(id_new, new_problem, current_id, current_problem, alike_entries)\n",
    "        suspect_count += suspect\n",
    "        remarked += linked\n",
    "\n",
    "end_time = time.perf_counter()\n",
    "print(\"总耗时:\",end_time-start_time,\"秒.\")\n",
    "print(\"发现相似: \",suspect_count,\", 其中已标注: \",remarked,\".\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write the report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "alike_problems = \"\".join(alike_entries)\n",
    "\n",
    "report_dir = os.path.dirname(report_path)\n",
    "if report_dir:\n",
    "    # Create the target folder on demand so a finished run is not lost to a missing directory.\n",
    "    os.makedirs(report_dir, exist_ok=True)\n",
    "with open(report_path, \"w\", encoding=\"utf8\") as f:\n",
    "    f.write(alike_problems)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.8 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "d311ffef239beb3b8f3764271728f3972d7b090c974f8e972fcdeedf230299ac"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}