import os
import re

try:
    import win32clipboard as wc
    import win32con
except ImportError:
    # pywin32 is Windows-only. Keep the text helpers importable elsewhere;
    # getCopy()/setCopy() raise ImportError when they are actually called.
    wc = None
    win32con = None


def _require_clipboard():
    """Raise ImportError when the pywin32 clipboard modules could not be imported."""
    if wc is None or win32con is None:
        raise ImportError("pywin32 (win32clipboard / win32con) is required for clipboard access")


def getCopy():
    """Return the clipboard content requested as CF_UNICODETEXT.

    The clipboard is closed in a ``finally`` clause so that a failing
    ``GetClipboardData`` call does not leave it open.
    """
    _require_clipboard()
    wc.OpenClipboard()
    try:
        return wc.GetClipboardData(win32con.CF_UNICODETEXT)
    finally:
        wc.CloseClipboard()


def setCopy(str):
    """Empty the clipboard and store ``str`` on it as CF_UNICODETEXT.

    The parameter keeps its original name (it shadows the builtin inside this
    function only) so that keyword callers keep working.
    """
    _require_clipboard()
    wc.OpenClipboard()
    try:
        wc.EmptyClipboard()
        wc.SetClipboardData(win32con.CF_UNICODETEXT, str)
    finally:
        wc.CloseClipboard()


# ---------------------------------------------------------------------------
# re.sub callbacks. Each receives a match object and returns the replacement.
# ---------------------------------------------------------------------------

def full_stop(matchobj):
    """Return ". " when group 1 is a lone full stop, otherwise ".\\n".

    A lone stop is the ideographic "。", an ASCII "." or the full-width
    U+FF0E; anything else (in practice a stop followed by newlines) collapses
    to a stop plus one newline.
    """
    if matchobj.group(1) in ("。", ".", "\uff0e"):
        return ". "
    return ".\n"


def refine_brackets(matchobj):
    """Return group 1 without its first and last character."""
    return matchobj.group(1)[1:-1]


def insert_a_blank(matchobj):
    """Return group 1 with a space inserted before its last character."""
    matched = matchobj.group(1)
    return matched[:-1] + " " + matched[-1]


def multiple_choice(matchobj):
    """Build a ``\\fourch{..}{..}{..}{..}`` line from groups 1-4."""
    options = "}{".join(matchobj.group(n) for n in (1, 2, 3, 4))
    return "\\fourch{" + options + "}\n"


def boldsymbols(matchobj):
    """Return ``\\i`` + group 1 with its last character wrapped in ``\\mathbf{}``."""
    matched = matchobj.group(1)
    return "\\i" + matched[:-1] + "\\mathbf{" + matched[-1] + "}"


def boldsymbols_star(matchobj):
    """Return ``\\in \\mathbf{<group 1>}^*``."""
    return "\\in \\mathbf{" + matchobj.group(1) + "}^*"


def singleboldsymbols(matchobj):
    """Return ``$\\mathbf{<group 1>}$``."""
    return "$\\mathbf{" + matchobj.group(1) + "}$"


def blackboardbold(matchobj):
    """Return ``\\mathbf{<group 1>}``."""
    return "\\mathbf{" + matchobj.group(1) + "}"


def limit(matchobj):
    """Return ``\\displaystyle\\lim_{<group 1>}``."""
    return "\\displaystyle\\lim_{" + matchobj.group(1) + "}"


def replace_i(matchobj):
    """Replace letters "i" in group 1 by ``\\mathrm{i}``, scanning right to left.

    An "i" is left alone when the text from that position to the end contains
    "item" or "overline".
    """
    string = matchobj.group(1)
    # Walk backwards so that an insertion never shifts an index still to visit.
    for i in range(len(string) - 1, -1, -1):
        tail = string[i:]
        if string[i] == "i" and "item" not in tail and "overline" not in tail:
            string = string[:i] + "\\mathrm{i}" + string[i + 1:]
    return string


def refine_log(matchobj):
    """Return ``\\log_`` followed by group 1."""
    return r"\log_" + matchobj.group(1)


def refine_powers(matchobj):
    """Return ``<group 1>^<group 2>``."""
    return matchobj.group(1) + "^" + matchobj.group(2)


def refine_sequences(matchobj):
    """Return group 1 wrapped in escaped braces: ``\\{...\\}``."""
    # "\\{" is the same two characters the original produced with the invalid
    # escape "\{", which newer Python versions warn about.
    return "\\{" + matchobj.group(1) + "\\}"


def refine_starting_brackets(matchobj):
    """Return ``$`` followed by group 1."""
    return "$" + matchobj.group(1)


def refine_left_operating_brackets(matchobj):
    """Return group 1 followed by group 2."""
    return matchobj.group(1) + matchobj.group(2)


def refine_right_operating_brackets(matchobj):
    """Return group 1 followed by group 2."""
    return matchobj.group(1) + matchobj.group(2)


def refine_brackets_in_brackets(matchobj):
    """Return groups 1, 2 and 3 concatenated."""
    return matchobj.group(1) + matchobj.group(2) + matchobj.group(3)


def mathbf(matchobj):
    """Return ``\\mathbf{<group 1>}^<group 2>``."""
    return "\\mathbf{" + matchobj.group(1) + "}^" + matchobj.group(2)


# Everything above is the text-processing mechanism from before 2022-07.

# Nesting level that rename_bracket() stamps on the markers it emits. The
# calling code rebinds this module-level name before each pass. It is given a
# value here because a module-level ``global`` statement defines nothing, so
# rename_bracket() would raise NameError until the first assignment.
layer = 0


def rename_bracket(matchobj):
    """Wrap group 1 in ``leftbracket<layer>`` / ``rightbracket<layer>`` markers.

    ``layer`` is read from the module-level variable of the same name.
    """
    return "leftbracket" + str(layer) + matchobj.group(1) + "rightbracket" + str(layer)


def frac_brackets(matchobj):
    """Return ``frac{<group 1>}{<group 2>}``."""
    return "frac{" + matchobj.group(1) + "}{" + matchobj.group(2) + "}"


def frac_single_second_bracket(matchobj):
    """Return ``frac <group 1>{<group 2>}``."""
    return "frac " + matchobj.group(1) + "{" + matchobj.group(2) + "}"


def recall_vital_bracket(matchobj):
    """Return ``<group 1>{<group 2>}``."""
    return matchobj.group(1) + "{" + matchobj.group(2) + "}"


def sqrt_brackets(matchobj):
    """Return ``sqrt <group 1>{<group 2>}``; an unmatched group 1 counts as empty."""
    first_group = matchobj.group(1) or ""
    return "sqrt " + first_group + "{" + matchobj.group(2) + "}"


def refine_single_second_frac(string):
    """Restore braces on the marker group that follows ``frac`` plus one word character.

    Done for every layer 0-6; ``frac x leftbracketN...rightbracketN`` becomes
    ``frac x{...}``.
    """
    for s in range(7):
        pattern = r"frac[\s]*(\w)[\s]*leftbracket" + str(s) + "(.*?)" + r"rightbracket" + str(s)
        string = re.sub(pattern, frac_single_second_bracket, string)
    return string


# Prefixes (as regex fragments) after which a marker group gets its braces back.
_VITAL_PREFIXES = ("frac", "line", "arrow", "_", r"\^", "mathrm", "mathbf", "begin", "end")


def refine_vital_bracket(string):
    """Restore braces on marker groups that directly follow a "vital" prefix.

    For each layer 0-6 and each prefix in ``_VITAL_PREFIXES`` (tried in that
    order), ``<prefix> leftbracketN...rightbracketN`` becomes ``<prefix>{...}``.
    """
    for s in range(7):
        for prefix in _VITAL_PREFIXES:
            pattern = "(" + prefix + r")[\s]*leftbracket" + str(s) + "(.*?)rightbracket" + str(s)
            string = re.sub(pattern, recall_vital_bracket, string)
    return string


def refine_sqrt(string):
    """Restore braces on the marker group following ``sqrt`` (and an optional ``[..]``)."""
    for s in range(7):
        pattern = r"sqrt[\s]*(\[\w*\])*[\s]*leftbracket" + str(s) + "(.*?)rightbracket" + str(s)
        string = re.sub(pattern, sqrt_brackets, string)
    return string


def give_blanks(string):
    """Insert a space between ``sqrt`` / ``frac`` and a word character glued to it."""
    string = re.sub(r"(sqrt[\w])", insert_a_blank, string)
    string = re.sub(r"(frac[\w])", insert_a_blank, string)
    return string


def give_brackets(string):
    """Drop the remaining bracket markers and turn set markers into ``\\{`` / ``\\}``."""
    string = re.sub(r"leftbracket\d", "", string)
    string = re.sub(r"rightbracket\d", "", string)
    string = re.sub(r"leftset", r"\{", string)
    string = re.sub(r"rightset", r"\}", string)
    return string


# Everything above is the text-processing mechanism added on 2022-07-15.

def initial_bracket_search(string):
    """Return the end index of a ``leftbracket<digit>`` marker that starts ``string``.

    Leading whitespace is allowed. Returns -1 when ``string`` does not start
    with such a marker.
    """
    found = re.search(r"^[\s]*?leftbracket(\d)", string)
    if found is None:
        return -1
    return found.span()[1]


def initial_brackets_pair_search(string, d):
    """Return the end index of the first ``rightbracket`` + ``d`` in ``string``, or -1."""
    found = re.search("rightbracket" + d, string)
    if found is None:
        return -1
    return found.span()[1]


def _leading_group(text):
    """Locate a complete marker group at the start of ``text``.

    Returns ``(end_index, layer_digit)`` or ``None`` when ``text`` does not
    start with ``leftbracket<d>`` or the matching ``rightbracket<d>`` is absent.
    """
    open_end = initial_bracket_search(text)
    if open_end <= 0:
        return None
    layer_digit = text[open_end - 1]
    close_end = initial_brackets_pair_search(text, layer_digit)
    if close_end < 0:
        return None
    return close_end, layer_digit


def _markers_to_braces(text, layer_digit):
    """Replace the layer-``layer_digit`` markers in ``text`` by ``{`` and ``}``."""
    text = re.sub(r"leftbracket" + layer_digit, "{", text)
    return re.sub(r"rightbracket" + layer_digit, "}", text)


def refine_frac(string):
    """Restore braces on both arguments of every ``frac`` that has two marker groups.

    The text is scanned left to right. After each ``frac``, if two marker
    groups follow directly, both are converted back to ``{...}``. Any other
    ``frac`` is left as it is, including one whose closing marker is missing;
    the original sliced with the -1 "not found" value in that case and could
    end up building an invalid regular expression.
    """
    eq_left = ""
    eq_right = string
    while True:
        found = re.search("frac", eq_right)
        if found is None:
            break
        # Move everything up to and including this "frac" to the finished part.
        eq_left += eq_right[:found.end()]
        eq_right = eq_right[found.end():]

        first = _leading_group(eq_right)
        if first is None:
            continue
        first_end, first_layer = first
        eq_remain = eq_right[first_end:]

        second = _leading_group(eq_remain)
        if second is None:
            continue
        second_end, second_layer = second

        first_bracket = _markers_to_braces(eq_right[:first_end], first_layer)
        second_bracket = _markers_to_braces(eq_remain[:second_end], second_layer)
        # The converted arguments stay in eq_right so nested fracs are still seen.
        eq_right = first_bracket + second_bracket + eq_remain[second_end:]
    return eq_left + eq_right


# The brace handling above was revised on 2022-07-18 (one bug fixed).

def reduce_blank(matchobj):
    """Return group 1 with every space character removed."""
    return matchobj.group(1).replace(" ", "")


def add_dollars(matchobj):
    """Wrap the interior of group 1 in ``$``, keeping its first and last character outside."""
    matched = matchobj.group(1)
    return matched[0] + "$" + matched[1:-1] + "$" + matched[-1]


def del_first_char(matchobj):
    """Return group 1 without its first character."""
    return matchobj.group(1)[1:]


def add_underline(matchobj):
    """Return the first and last character of group 1 joined by ``_``."""
    matched = matchobj.group(1)
    return matched[0] + "_" + matched[-1]


def brackets_to_cwords(matchobj):
    """Return group 1 between the placeholder words 左括号 and 右括号."""
    return "左括号" + matchobj.group(1) + "右括号"


def cwords_to_brackets(matchobj):
    """Return group 1 between ASCII parentheses."""
    return "(" + matchobj.group(1) + ")"


def circled_brackets(matchobj):
    """Return group 1 with its last character wrapped in braces."""
    matched = matchobj.group(1)
    return matched[:-1] + "{" + matched[-1] + "}"
# ---------------------------------------------------------------------------
# Clipboard -> LaTeX clean-up script.
#
# Reads text from the clipboard, normalises punctuation and symbols, cleans the
# text and the $...$ formulas separately, then writes the result back to the
# clipboard and to OUTPUT_PATH.
#
# NOTE(review): the copy of this cell that was reviewed is damaged. Literals
# under the "full-width" section comments appear as ASCII, several patterns are
# empty strings, and one stretch of code is missing entirely. Every place where
# something had to be reconstructed is marked NOTE(review) below; please compare
# those places with the original notebook.
# ---------------------------------------------------------------------------

# Disabled alternative input (kept from the original): read the text from a file.
# try:
#     os.chdir(r"D:\mathdept\mathdept\文本处理程序等")
# except:
#     os.chdir(r"D:\mathdept\文本处理程序等")
# with open("textfile.txt", "r", encoding = "utf8") as textfile:
#     data = textfile.read()

OUTPUT_PATH = "临时文件/outputfile.txt"


def _fullwidth(text):
    """Return ``text`` with each ASCII character (0x21-0x7E) shifted to its full-width form."""
    return "".join(chr(ord(ch) + 0xFEE0) for ch in text)


data = getCopy()

# Remove the \left / \right size prefixes, including "\left." and "\right.".
# The lookahead keeps commands that merely start with these letters, such as
# \rightarrow and \leftarrow, which plain str.replace() used to cut to "arrow".
data = re.sub(r"\\(?:left|right)(?![A-Za-z])\.?", "", data)

# --- Full-width -> half-width punctuation ----------------------------------
# NOTE(review): written with explicit full-width code points. As ASCII, "(",
# ")" and "?" are invalid regular expressions, "." matches every character and
# the space rule is a no-op.
data = data.replace("\u3000", " ")
# A stop followed by newlines becomes ".\n"; a stop inside a line becomes ". ".
data = re.sub("([。\uff0e])(\n*)", lambda m: ".\n" if m.group(2) else ". ", data)
for fullwidth_mark, replacement in (
    ("\uff0c", ", "),
    ("\uff1a", ": "),
    ("\uff1b", "; "),
    ("\uff08", "("),
    ("\uff09", ")"),
    ("\uff1f", "? "),
    ("“", "``"),
    ("”", "''"),
):
    data = data.replace(fullwidth_mark, replacement)

# --- Symbols and full-width letters -> LaTeX -------------------------------
# Function names first. ASCII names are skipped when they already belong to a
# command (preceded by a backslash) or to "arc..."; full-width names are
# accepted as well.
for function_name in ("log", "lg", "ln", "sin", "cos", "tan", "cot"):
    data = data.replace(_fullwidth(function_name), "\\" + function_name + " ")
data = re.sub(r"(?<!\\)(?<!arc)(log|lg|ln|sin|cos|tan|cot)", r"\\\1 ", data)

# Literal replacements, applied in this order.
# NOTE(review): the entries built with _fullwidth() appear as ASCII in the
# reviewed copy, where they would rewrite the LaTeX inserted just above them
# (e.g. every "e" in \theta). The six entries ⊆ ⊂ ⊃ ⇒ ∅ ⊇ had EMPTY patterns
# (the original glyphs are lost); the standard Unicode symbols are used
# instead. Confirm both groups against the original notebook.
SYMBOL_REPLACEMENTS = (
    ("α", r"\alpha"),
    ("β", r"\beta"),
    (_fullwidth("{"), r"\{"),
    (_fullwidth("}"), r"\}"),
    ("∪", r"\cup "),
    ("∩", r"\cap "),
    ("∞", r"\infty"),
    ("γ", r"\gamma"),
    ("δ", r"\delta"),
    ("≤", r"\le "),
    ("≥", r"\ge "),
    ("槡", r"\sqrt "),
    ("≠", r"\ne "),
    ("π", r"\pi "),
    ("θ", r"\theta "),
    ("△", r"\triangle "),
    ("φ", r"\varphi "),
    ("ω", r"\omega "),
    # The two-character rule must come before the rule for its first
    # character; in the original order it could never match.
    ("珤犲", r"\overrightarrow e_ "),
    ("珗", r"\overrightarrow "),
    ("珝", r"\overrightarrow "),
    ("珤", r"\overrightarrow "),
    ("λ", r"\lambda "),
    (_fullwidth("i"), r"\mathrm{i}"),
    ("∈", r"\in "),
    ("⊥", r"\perp "),
    ("∥", r"\parallel "),
    ("①", r"\textcircled{1} "),
    ("②", r"\textcircled{2} "),
    ("③", r"\textcircled{3} "),
    ("④", r"\textcircled{4} "),
    ("⑤", r"\textcircled{5} "),
    ("⊆", r"\subseteq "),
    ("⊂", r"\subset "),
    ("⊃", r"\supset "),
    ("⇒", r"\Rightarrow "),
    ("∅", r"\varnothing "),
    ("×", r"\times "),
    ("·", r"\cdot "),
    (_fullwidth("cm"), r"\text{cm}"),
    ("°", r"^\circ "),
    ("∶", ": "),
    (_fullwidth("m"), r"\text{m}"),
    ("∠", r"\angle "),
    ("→\n ", ""),
    ("〈", r"\langle "),
    ("〉", r"\rangle "),
    ("…", r"\cdots"),
    (_fullwidth("P"), r"\mathrm{P}^"),
    ("⊇", r"\supseteq"),
    (_fullwidth("e"), r"\mathrm{e}"),
    ("μ", r"\mu"),
    ("ρ", r"\rho"),
)
for symbol, latex in SYMBOL_REPLACEMENTS:
    data = data.replace(symbol, latex)

# Escape percent signs (ASCII or full-width) that are not escaped already.
data = re.sub(r"(?<!\\)[%\uff05]", r"\\%", data)

# Fix some frequently produced wrong LaTeX commands.
data = data.replace("centerdot", "cdot")
data = data.replace("cancel", "not")

# --- Mis-decoded glyphs -> plain characters --------------------------------
whole_numbers = "0123456789+-=犞犎犲狆狇狉犕犖><犃犅犆犇狓犝[]|犪狔犙犽犘犚犫犛犮犈犗犿犣狀犳犵犺狋犻犼狕犉犾′犱狊犌犡犢狘"
# NOTE(review): only the prefix "0123456789+-=VHepqrMN><" of this string
# survives in the reviewed copy. The rest is reconstructed from the visible
# pairs (犞->V, 犎->H, 犲->e, 狆->p, ...), which follow alphabetical order in
# both alphabets. "′" -> "'" and "狘" -> "|" are guesses - confirm.
correct_numbers = "0123456789+-=VHepqrMN><ABCDxU[]|ayQkPRbScEOmZnfghtijzFl'dsGXY|"
# NOTE(review): the statement(s) applying this mapping are missing from the
# reviewed copy; a character-by-character replacement is assumed.
for garbled, plain in zip(whole_numbers, correct_numbers):
    if garbled == plain:
        # Identity pairs are presumably full-width digits/operators whose
        # width was lost (the section comment says "full-width digits").
        garbled = _fullwidth(plain)
    data = data.replace(garbled, plain)

# --- Wrap bare formulas in $...$ -------------------------------------------
# NOTE(review): these statements are indented in the original, but the
# statement that opened the block is missing (any code between the mapping
# above and this point is lost too). ASSUMPTION to confirm: the block is meant
# for text that has no "$" delimiters yet. Running it on text that already
# contains $...$ would insert additional "$" inside existing formulas.
if "$" not in data:
    # NOTE(review): only the tail of the first statement survives and it equals
    # the tail of the second, so the same substitution is assumed to run twice
    # (a second pass catches runs whose leading delimiter was consumed as the
    # trailing delimiter of the previous match).
    for _ in range(2):
        data = re.sub(r"([\u4e00-\u9fa5、\)][0-9a-zA-Z_\^\\\{\}|=><\s\n'\(\)\-\+:±]{2,}[\u4e00-\u9fa5、\,\.;\( ])", add_dollars, data)
    data = re.sub(r"([\u4e00-\u9fa5、 \)][0-9a-zA-Z]{1}[\u4e00-\u9fa5、\,\.;\( ])", add_dollars, data)
    data = re.sub(r"左括号\$(\d{1,2})\$右括号", cwords_to_brackets, data)
    data = re.sub(r"\$\,[\s]*\$", ", ", data)
    # Remove a newline unless the next character is "(" or a backslash.
    data = re.sub(r"(\n[^(\\])", del_first_char, data)

# --- Separate plain text from formulas -------------------------------------
# A formula is the shortest span between two "$", both "$" included.
# raw_texts[k] is the text in front of raw_equations[k]; a non-empty tail after
# the last formula becomes one extra text entry.
raw_texts = []      # text pieces
raw_equations = []  # formula pieces
cursor = 0
for found in re.finditer(r"\$[\s\S]*?\$", data):
    raw_texts.append(data[cursor:found.start()])
    raw_equations.append(found.group(0))
    cursor = found.end()
if cursor < len(data):
    raw_texts.append(data[cursor:])

modified_texts = []
modified_equations = []

# --- Clean the text pieces -------------------------------------------------
for text in raw_texts:
    text1 = text
    # Remove useless whitespace just inside option braces. The opening-brace
    # rule is greedy now; the lazy "+?" removed only one whitespace character.
    text1 = re.sub(r"\{\s+", "{", text1)
    text1 = re.sub(r"[\s]+?\}", "}", text1)
    # Fill-in-the-blank questions: a run of 5+ spaces/underscores is a blank.
    text1 = re.sub("[ _]{5,}", r"\\blank{50}", text1)
    # Multiple-choice questions: an empty pair of parentheses is the answer slot.
    text1 = re.sub(r"\(\\blank\{50\}\)", r"\\bracket{20}", text1)
    text1 = re.sub(r"\([\s]{1,10}\)", r"\\bracket{20}", text1)
    # Exactly one space after every comma.
    text1 = re.sub(",[ ]*", ", ", text1)
    text1 = re.sub(r"\.\}", "}", text1)
    # "<newline>12." starts a new \item.
    text1 = re.sub(r"\n\d{1,3}\.", r"\n\\item ", text1)
    text1 = re.sub(r"\s{3,}\.", r"\\blank{50}.", text1)
    text1 = re.sub(r"\s{3,}\,", r"\\blank{50},", text1)
    text1 = re.sub(r"\\bracket\{20\}\n", r"\\bracket{20}.\n", text1)
    text1 = re.sub(r"\\mathrm{\\mathrm{i}}", r"\\mathrm{i}", text1)
    text1 = re.sub(r"\\\\mathrm{i}n", r"\\in ", text1)
    modified_texts.append(text1)

# --- Clean the formulas ----------------------------------------------------
for equation in raw_equations:
    equation1 = equation
    # Remove braces around a single character and pointless double braces.
    for _ in range(20):
        equation1 = re.sub(r"(\{[\w\+\-\*]?\})", refine_brackets, equation1)
    for _ in range(20):
        equation1 = re.sub(r"(\{\{[^\{\}]*?\}\})", refine_brackets, equation1)
    # Remove spacing commands and meaningless spaces.
    equation1 = re.sub(r"\\[!,]|\\quad|\\qquad", "", equation1)
    equation1 = re.sub(r"\{ *\}", "", equation1)
    equation1 = re.sub(r"\( *", "(", equation1)
    equation1 = re.sub(r" *\)", ")", equation1)
    equation1 = re.sub(r"\$ *", "$", equation1)
    equation1 = re.sub(r" *\$", "$", equation1)
    for _ in range(2):
        equation1 = re.sub(r"([A-Z0-9]) ([A-Z0-9])", lambda matchobj: matchobj.group(1) + matchobj.group(2), equation1)
    # Brace clean-up (2022-07-15): set braces and ordinary braces are turned
    # into word markers, braces that must stay are restored, the rest dropped.
    equation1 = re.sub(r"\\\{", "leftset", equation1)
    equation1 = re.sub(r"\\\}", "rightset", equation1)
    # rename_bracket reads the module-level name `layer`, so this loop variable
    # has to stay a module-level name with exactly this spelling.
    for layer in range(7):
        equation1 = re.sub(r"\{([^\{\}]*)\}", rename_bracket, equation1)
    equation1 = refine_sqrt(equation1)
    equation1 = refine_vital_bracket(refine_single_second_frac(refine_frac(equation1)))
    equation1 = give_blanks(equation1)
    equation1 = give_brackets(equation1)

    # \bigcup / \bigcap -> \cup / \cap.
    equation1 = equation1.replace("\\bigc", "\\c")
    # Number sets in bold. The class is [RQZNC]; the original [R|Q|Z|N|C] also
    # accepted a literal "|".
    equation1 = re.sub(r"\\i(n[ ]*[RQZNC])", boldsymbols, equation1)
    equation1 = re.sub(r"\\in[\s]*?\{([NZQRC])\^\*\}", boldsymbols_star, equation1)
    equation1 = re.sub(r"\\mathbf([ZRNQC])", blackboardbold, equation1)
    equation1 = re.sub(r"\\text([ZRNQC])", blackboardbold, equation1)
    equation1 = re.sub("operatorname", "mathbf ", equation1)
    # Limits of sequences.
    # NOTE(review): these patterns expect braces and "\," that the steps above
    # have already removed - check whether they can still match.
    equation1 = re.sub(r"\\underset{([\w]\\to \\infty) }{\\mathop\\lim }\\,", limit, equation1)
    equation1 = re.sub(r"\\underset{([\w]\\to \\infty) }{\\mathop{lim}}\\,", limit, equation1)
    equation1 = re.sub(r"\\underset{([\w]\\to \\infty) }{\\mathop{\\lim }}\\,", limit, equation1)
    equation1 = re.sub(r"\\underset{([\w]\\to \\infty) }{\\mathop{\\lim }}", limit, equation1)
    equation1 = re.sub(r"\\underset{([\w]\\to \\infty) }{\\mathop{\\lim }", limit, equation1)
    # centerdot -> cdot.
    equation1 = re.sub(r"centerdot", r"cdot", equation1)
    # Case distinctions and systems of equations.
    equation1 = re.sub(r"align\}[\s]*", "cases}", equation1)
    equation1 = re.sub(r"\\\\[\s]*", r"\\\\", equation1)
    equation1 = re.sub(r"&", r"", equation1)
    equation1 = re.sub(r"\\\{[\s]*\\begin", r"\\begin", equation1)
    equation1 = re.sub(r"\\\\\\end", r"\\end", equation1)
    # Fractions in display style.
    equation1 = re.sub(r"\\frac", r"\\dfrac", equation1)
    # Superfluous backslash-space around commas.
    equation1 = re.sub(r"\\[\s]*?,[\s]*?\\", ",", equation1)
    # Three \cdot in a row -> \cdots.
    equation1 = re.sub(r"\\cdot[\s]*?\\cdot[\s]*?\\cdot", r"\\cdots", equation1)
    # \bot -> \perp.
    equation1 = re.sub(r"\\bot", r"\\perp", equation1)
    # \texti and \mathrmi -> \mathrm{i}.
    equation1 = re.sub(r"\\texti", r"\\mathrm{i}", equation1)
    equation1 = re.sub(r"\\mathrmi", r"\\mathrm{i}", equation1)
    # Matrices and determinants.
    equation1 = re.sub(r"\([\s]*?\\begin\{matrix\}", r"\\begin{pmatrix}", equation1)
    equation1 = re.sub(r"\\end\{matrix\}[\s]*?\)", r"\\end{pmatrix}", equation1)
    equation1 = re.sub(r"\|[\s]*?\\begin\{matrix\}", r"\\begin{vmatrix}", equation1)
    equation1 = re.sub(r"\\end\{matrix\}[\s]*?\|", r"\\end{vmatrix}", equation1)
    equation1 = re.sub(r"\\Delta", r"\\triangle", equation1)
    equation1 = re.sub(r"\\vartriangle", r"\\triangle", equation1)
    equation1 = re.sub(r"\\\{\s*\.\s*", r"\\{", equation1)
    equation1 = re.sub(r"\s*\|\s*", "|", equation1)
    equation1 = re.sub(r"\s*\\\}", r"\\}", equation1)
    equation1 = re.sub(r"\\\{\s*", r"\\{", equation1)
    equation1 = re.sub(r"\{ *([ZRNQC])\^([\+\-*]) *\}", mathbf, equation1)
    equation1 = re.sub(r"\{\\log *\}_", r"\\log_", equation1)
    # Remove spaces just inside parentheses and square brackets.
    equation1 = re.sub(r"([^\\]\s+?\])", reduce_blank, equation1)
    equation1 = re.sub(r"([^\\]\s+?\))", reduce_blank, equation1)
    equation1 = re.sub(r"(\(\s+?)", reduce_blank, equation1)
    equation1 = re.sub(r"(\[\s+?)", reduce_blank, equation1)
    modified_equations.append(equation1)

# --- Interleave the cleaned text and formulas again -------------------------
# There is never more than one text piece more than there are formulas, so an
# index check replaces the original bare "except:" around the list access.
merged_pieces = []
for index, text_piece in enumerate(modified_texts):
    merged_pieces.append(text_piece)
    if index < len(modified_equations):
        merged_pieces.append(modified_equations[index])
modified_data = "".join(merged_pieces)

modified_data = re.sub(r"[ ]+\n", "\n", modified_data)
modified_data = re.sub(r"\$[\s]*?\\parallel[\s]*?\$", r"\\parallel", modified_data)
modified_data = re.sub(r"\n例\s*?\d{1,3}\s*", r"\n\\item ", modified_data)
modified_data = re.sub(r"ABCDA_1B_1C_1D_1", r"$ABCD-A_1B_1C_1D_1$", modified_data)
modified_data = re.sub(r"(\$[\,\.:;]\$)", refine_brackets, modified_data)

# Added while transcribing textbook exercises: restore the braces of \textcircled.
modified_data = re.sub(r"(textcircled[\d])", circled_brackets, modified_data)

# Remove whitespace between "log" and its subscript.
modified_data = re.sub(r"log[\s]+_", r"log_", modified_data)

# Mathpix output: drop spaces between Chinese characters and next to "$".
for _ in range(3):
    modified_data = re.sub(r"([\u4e00-\u9fa5])( )([\u4e00-\u9fa5])", lambda x: x.group(1) + x.group(3), modified_data)
    modified_data = re.sub(r"\$ ", "$", modified_data)
    modified_data = re.sub(r" \$", "$", modified_data)
# Mathpix output: frequently misrecognised characters.
modified_data = modified_data.replace("雉", "锥")
modified_data = modified_data.replace("粗圆", "椭圆")
modified_data = modified_data.replace("针角", "钝角")
# Mathpix output: vectors and overlines.
modified_data = modified_data.replace(r"\vec", r"\overrightarrow ")
modified_data = modified_data.replace(r"\bar", r"\overline ")
# Mathpix output: limits.
modified_data = modified_data.replace(r"\lim _{n \rightarrow \infty}", r"\displaystyle\lim_{n\to\infty}")
# Mathpix output: an enumeration comma between two formulas.
modified_data = modified_data.replace(r" 、 ", r"$、$")
# \leqslant / \geqslant -> \leq / \geq, and similar touch-ups.
modified_data = modified_data.replace(r"slant", "")
modified_data = modified_data.replace(r"\mid", "|")
modified_data = re.sub(r"\\mathrm\{\\mathrm\{i\}\}", r"\\mathrm{i}", modified_data)
modified_data = modified_data.replace(",$", ", $")
modified_data = modified_data.replace(" / /", r"\parallel")
modified_data = modified_data.replace("mathrmR", r"mathbf{R}")
modified_data = modified_data.replace(r"^{\prime}", "'")
# Fractions inside exponents and subscripts go back to text-style \frac.
modified_data = re.sub(r"\^\{\\dfrac", r"^{\\frac", modified_data)
modified_data = re.sub(r"\^\{-\\dfrac", r"^{-\\frac", modified_data)
# The subscript rules used to emit "^{_{", which opens one brace more than the
# input closes; they now mirror the exponent rules.
modified_data = re.sub(r"_\{\\dfrac", r"_{\\frac", modified_data)
modified_data = re.sub(r"_\{-\\dfrac", r"_{-\\frac", modified_data)

modified_data = re.sub(r"\\begin\{array\}[rcl]*", r"\\begin{cases}", modified_data)
modified_data = re.sub(r"\\end{array}", r"\\end{cases}", modified_data)

setCopy(modified_data)

# Also keep a copy on disk. The directory is created when missing so that the
# write cannot fail after the clipboard has already been updated.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(OUTPUT_PATH, "w", encoding="utf8") as f:
    f.write(modified_data)
"execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "mathdept", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.15 (main, Nov 24 2022, 14:39:17) [MSC v.1916 64 bit (AMD64)]" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "ff3c292c316ba85de6f1ad75f19c731e79d694e741b6f515ec18f14996fe48dc" } } }, "nbformat": 4, "nbformat_minor": 2 }