{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from hashlib import new\n",
    "import os,re,difflib,Levenshtein,time,json\n",
    "\n",
    "# IMPORTANT: ID range of the new problems to check (comma-separated chunks; a:b is an inclusive integer range)\n",
    "id_new_problems = \"20000:20010\"\n",
    "threshold = 0.85\n",
    "\n",
    "# Expand an ID specification into zero-padded 6-digit ID strings.\n",
    "# Chunks are separated by commas; inside a chunk, a:b is the inclusive integer range a..b.\n",
    "def generate_number_set(string):\n",
    "    \"\"\"Expand a comma-separated ID specification into a list of ID strings.\n",
    "\n",
    "    Args:\n",
    "        string: Specification such as \"3,20000:20010\". All whitespace\n",
    "            (including newlines) is ignored.\n",
    "\n",
    "    Returns:\n",
    "        list[str]: The IDs in specification order, each left-padded with\n",
    "        zeros to 6 characters.\n",
    "    \"\"\"\n",
    "    string = re.sub(r\"\\s\", \"\", string)\n",
    "    numbers_list = []\n",
    "    for chunk in string.split(\",\"):\n",
    "        if not chunk:\n",
    "            # An empty chunk (empty spec, trailing or doubled comma) would\n",
    "            # otherwise turn into the phantom ID \"000000\".\n",
    "            continue\n",
    "        if \":\" in chunk:\n",
    "            start, end = chunk.split(\":\")\n",
    "            numbers_list.extend(str(ind).zfill(6) for ind in range(int(start), int(end) + 1))\n",
    "        else:\n",
    "            numbers_list.append(chunk.zfill(6))\n",
    "    return numbers_list\n",
    "\n",
    "# Normalise a problem text before it is compared with another one.\n",
    "def pre_treating(string):\n",
    "    \"\"\"Strip layout-only LaTeX noise from a problem text.\n",
    "\n",
    "    The substitutions run from the most specific pattern to the most\n",
    "    general one, because the final step deletes the backslashes and braces\n",
    "    that the earlier patterns need in order to match.\n",
    "\n",
    "    Args:\n",
    "        string: Raw problem text.\n",
    "\n",
    "    Returns:\n",
    "        str: The text without center environments, without the command\n",
    "        names displaystyle / overrightarrow, without literal backslash-n and\n",
    "        backslash-t escape sequences, and without whitespace, backslashes,\n",
    "        braces, dollar signs, parentheses and square brackets.\n",
    "    \"\"\"\n",
    "    # Drop whole center environments first, while their begin/end markers are still intact.\n",
    "    string = re.sub(r\"\\\\begin\\{center\\}[\\s\\S]*?\\\\end\\{center\\}\",\"\",string)\n",
    "    # Drop purely presentational command names before any letters or backslashes are touched.\n",
    "    string = re.sub(r\"(displaystyle)|(overrightarrow)\",\"\",string)\n",
    "    # Remove literal backslash-n / backslash-t sequences, but leave LaTeX commands that\n",
    "    # merely start with those letters (neq, theta, ...) alone. The former character\n",
    "    # class deleted every letter n and t from the text.\n",
    "    string = re.sub(r\"\\\\[nt](?![A-Za-z])\",\"\",string)\n",
    "    # Finally remove whitespace and the delimiter characters themselves.\n",
    "    string = re.sub(r\"[\\s\\\\\\{\\}\\$\\(\\)\\[\\]]\",\"\",string)\n",
    "    return string\n",
    "\n",
    "# Similarity based on difflib.\n",
    "def difflab_get_equal_rate(str1, str2):\n",
    "    \"\"\"Return difflib's SequenceMatcher ratio of the two texts after pre_treating.\"\"\"\n",
    "    cleaned = [pre_treating(text) for text in (str1, str2)]\n",
    "    matcher = difflib.SequenceMatcher(None, cleaned[0], cleaned[1])\n",
    "    return matcher.ratio()\n",
    "\n",
    "# Similarity based on Levenshtein.jaro.\n",
    "def jaro_get_equal_rate(str1,str2):\n",
    "    \"\"\"Return Levenshtein.jaro of the two texts after pre_treating.\"\"\"\n",
    "    cleaned = [pre_treating(text) for text in (str1, str2)]\n",
    "    return Levenshtein.jaro(cleaned[0], cleaned[1])\n",
    "\n",
    "# Similarity based on Levenshtein.ratio.\n",
    "def Lev_get_equal_rate(str1,str2):\n",
    "    \"\"\"Return Levenshtein.ratio of the two texts after pre_treating.\"\"\"\n",
    "    cleaned = [pre_treating(text) for text in (str1, str2)]\n",
    "    return Levenshtein.ratio(cleaned[0], cleaned[1])\n",
    "\n",
    "\n",
    "# Similarity function used for every comparison below.\n",
    "sim_test = jaro_get_equal_rate\n",
    "\n",
    "# Load the problem database (a JSON object keyed by problem ID).\n",
    "with open(r\"../题库0.3/Problems.json\",\"r\",encoding = \"utf8\") as f:\n",
    "    pro_dict = json.load(f)\n",
    "\n",
    "# Split the database into new problems (IDs listed in id_new_problems) and old problems (all others).\n",
    "new_id_list = generate_number_set(id_new_problems)\n",
    "# A set keeps each membership test constant-time instead of scanning the ID list per problem.\n",
    "new_id_set = set(new_id_list)\n",
    "old_problems_dict = {}\n",
    "new_problems_dict = {}\n",
    "for problem_id, problem in pro_dict.items():\n",
    "    if problem_id in new_id_set:\n",
    "        new_problems_dict[problem_id] = problem\n",
    "    else:\n",
    "        old_problems_dict[problem_id] = problem\n",
    "print(\"旧题目数:\",len(old_problems_dict),\", 新题目数:\",len(new_problems_dict))\n",
    "\n",
    "# Record the start time and reset the counters.\n",
    "start_time = time.time()\n",
    "suspect_count = 0\n",
    "remarked = 0\n",
    "\n",
    "# Report text for similar pairs that are not yet cross-referenced.\n",
    "alike_problems = \"\"\n",
    "\n",
    "# Compare every new problem with every old problem.\n",
    "count = 0\n",
    "print(\"开始新题与旧题的比对\")\n",
    "for id_new, new_problem in new_problems_dict.items():\n",
    "    count += 1\n",
    "    if count % 50 == 0:\n",
    "        print(count)\n",
    "    for id_old, old_problem in old_problems_dict.items():\n",
    "        similar_rate = sim_test(new_problem[\"content\"],old_problem[\"content\"])\n",
    "        # A pair is already annotated when either problem lists the other under \"related\" or \"same\".\n",
    "        already_marked = (\n",
    "            id_new in old_problem[\"related\"]\n",
    "            or id_new in old_problem[\"same\"]\n",
    "            or id_old in new_problem[\"related\"]\n",
    "            or id_old in new_problem[\"same\"]\n",
    "        )\n",
    "        if already_marked:\n",
    "            suspect_count += 1\n",
    "            remarked += 1\n",
    "        elif similar_rate > threshold:\n",
    "            suspect_count += 1\n",
    "            alike_problems += \"\".join((\n",
    "                \"%.4f\" % similar_rate, \"\\n\\n\",\n",
    "                id_new, \" \", new_problem[\"content\"], \"\\n\\n\",\n",
    "                id_old, \" \", old_problem[\"content\"], \"\\n\\n\",\n",
    "            ))\n",
    "\n",
    "# Compare the new problems with each other, visiting each unordered pair once.\n",
    "count = 0\n",
    "print(\"开始新题之间的比对\")\n",
    "# Work on a snapshot instead of popping entries, so new_problems_dict is still intact afterwards.\n",
    "new_items = list(new_problems_dict.items())\n",
    "for position in range(len(new_items) - 1):\n",
    "    count += 1\n",
    "    if count % 50 == 0:\n",
    "        print(count)\n",
    "    current_problem = new_items[position][1]\n",
    "    # Only the problems after the current one are compared, matching the pair order of the former pop-based loop.\n",
    "    for id_new, candidate in new_items[position + 1:]:\n",
    "        similar_rate = sim_test(candidate[\"content\"],current_problem[\"content\"])\n",
    "        # A pair is already annotated when either problem lists the other under \"related\" or \"same\".\n",
    "        already_marked = (\n",
    "            id_new in current_problem[\"related\"]\n",
    "            or id_new in current_problem[\"same\"]\n",
    "            or current_problem[\"id\"] in candidate[\"related\"]\n",
    "            or current_problem[\"id\"] in candidate[\"same\"]\n",
    "        )\n",
    "        if already_marked:\n",
    "            suspect_count += 1\n",
    "            remarked += 1\n",
    "        elif similar_rate > threshold:\n",
    "            suspect_count += 1\n",
    "            alike_problems += \"\".join((\n",
    "                \"%.4f\" % similar_rate, \"\\n\\n\",\n",
    "                id_new, \" \", candidate[\"content\"], \"\\n\\n\",\n",
    "                current_problem[\"id\"], \" \", current_problem[\"content\"], \"\\n\\n\",\n",
    "            ))\n",
    "\n",
    "\n",
    "# Record the end time and show the results.\n",
    "end_time = time.time()\n",
    "print(\"总耗时:\",end_time-start_time,\"秒.\")\n",
    "print(\"发现相似: \",suspect_count,\", 其中已标注: \",remarked,\".\")\n",
    "\n",
    "# open() does not create missing folders, so make sure the output folder exists first.\n",
    "output_path = \"临时文件/相似题目.txt\"\n",
    "os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
    "with open(output_path,\"w\",encoding=\"utf8\") as f:\n",
    "    f.write(alike_problems)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.8 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "d311ffef239beb3b8f3764271728f3972d7b090c974f8e972fcdeedf230299ac"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}