mathdeptv2/工具/相同题目检测.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "题目数: 18795\n",
      "500\n",
      "1000\n",
      "1500\n",
      "2000\n",
      "2500\n",
      "3000\n",
      "3500\n",
      "4000\n",
      "4500\n",
      "5000\n",
      "5500\n",
      "6000\n",
      "6500\n",
      "7000\n",
      "7500\n",
      "8000\n",
      "8500\n",
      "9000\n",
      "9500\n",
      "10000\n",
      "10500\n",
      "11000\n",
      "11500\n",
      "12000\n",
      "12500\n",
      "13000\n",
      "13500\n",
      "14000\n",
      "14500\n",
      "15000\n",
      "15500\n",
      "16000\n",
      "16500\n",
      "17000\n",
      "17500\n",
      "18000\n",
      "耗时: 448.506秒\n"
     ]
    }
   ],
   "source": [
    "import os,re,difflib,Levenshtein,time,json\n",
    "\n",
    "# 相同题目的阈值\n",
    "threshold = 0.99\n",
    "\n",
    "outputfile = r\"临时文件/相同题目列表.txt\"\n",
    "\n",
    "#生成数码列表, 逗号分隔每个区块, 区块内部用:表示整数闭区间\n",
    "def generate_number_set(string):\n",
    "    string = re.sub(r\"[\\n\\s]\",\"\",string)\n",
    "    string_list = string.split(\",\")\n",
    "    numbers_list = []\n",
    "    for s in string_list:\n",
    "        if not \":\" in s:\n",
    "            numbers_list.append(s.zfill(6))\n",
    "        else:\n",
    "            start,end = s.split(\":\")\n",
    "            for ind in range(int(start),int(end)+1):\n",
    "                numbers_list.append(str(ind).zfill(6))\n",
    "    return numbers_list\n",
    "\n",
    "#字符串预处理\n",
    "def pre_treating(string):\n",
    "    string = re.sub(r\"\\\\begin\\{center\\}[\\s\\S]*?\\\\end\\{center\\}\",\"\",string)\n",
    "    string = re.sub(r\"(bracket\\{\\d+\\})|(blank\\{\\d+\\})|(fourch)|(twoch)|(onech)\",\"\",string)\n",
    "    string = re.sub(r\"[\\s\\\\\\{\\}\\$\\(\\)\\[\\]]\",\"\",string)\n",
    "    string = re.sub(r\"[\\n\\t]\",\"\",string)\n",
    "    string = re.sub(r\"(displaystyle)|(overrightarrow)\",\"\",string)\n",
    "    string = re.sub(r\"[,\\.:;?]\",\"\",string)\n",
    "    return string\n",
    "\n",
    "#difflab字符串比较\n",
    "def difflab_get_equal_rate(str1, str2):\n",
    "    # str1 = pre_treating(str1)\n",
    "    # str2 = pre_treating(str2)\n",
    "    return difflib.SequenceMatcher(None, str1, str2).ratio()\n",
    "\n",
    "#Levenshtein jaro字符串比较\n",
    "def jaro_get_equal_rate(str1,str2):\n",
    "    # str1 = pre_treating(str1)\n",
    "    # str2 = pre_treating(str2)\n",
    "    return Levenshtein.jaro(str1,str2)\n",
    "\n",
    "#Levenshtein 字符串比较\n",
    "def Lev_get_equal_rate(str1,str2):\n",
    "    # str1 = pre_treating(str1)\n",
    "    # str2 = pre_treating(str2)\n",
    "    return Levenshtein.ratio(str1,str2)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "#指定对比方法\n",
    "sim_test = jaro_get_equal_rate\n",
    "\n",
    "#读入题库\n",
    "with open(r\"../题库0.3/Problems.json\",\"r\",encoding = \"utf8\") as f:\n",
    "    database = f.read()\n",
    "pro_dict = json.loads(database)\n",
    "\n",
    "pro_dict_treated = {}\n",
    "for id in pro_dict:\n",
    "    pro_dict_treated[id] = pro_dict[id].copy()\n",
    "    pro_dict_treated[id][\"content\"] = pre_treating(pro_dict_treated[id][\"content\"])\n",
    "\n",
    "\n",
    "print(\"题目数:\",len(pro_dict))\n",
    "\n",
    "#记录起始时间\n",
    "starttime = time.time()\n",
    "alike_problems = \"\"\n",
    "\n",
    "\n",
    "count = 0\n",
    "keys = list(pro_dict_treated.keys())\n",
    "while len(keys) >= 2:\n",
    "    count += 1\n",
    "    if count % 500 == 0:\n",
    "        print(count)\n",
    "    \n",
    "    currentid = keys.pop(0)\n",
    "    content1 = pro_dict_treated[currentid][\"content\"]\n",
    "    same = []\n",
    "    for id in keys:\n",
    "        if not id in pro_dict[currentid][\"same\"] and not id in pro_dict[currentid][\"related\"]:\n",
    "            content2 = pro_dict_treated[id][\"content\"]\n",
    "            if sim_test(content1,content2)>threshold:\n",
    "                same.append(id)\n",
    "    if len(same) >= 1:\n",
    "        # print(currentid)\n",
    "        alike_problems += currentid + \",\"\n",
    "        for i in same:\n",
    "            # print(i)\n",
    "            keys.pop(keys.index(i))\n",
    "        alike_problems += \",\".join(same)\n",
    "        alike_problems += \"\\n\\n\"\n",
    "\n",
    "endtime = time.time()\n",
    "print(\"耗时: %.3f秒\" %(endtime-starttime))\n",
    "\n",
    "with open(outputfile,\"w\",encoding = \"u8\") as f:\n",
    "    f.write(alike_problems)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mathdept",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}