This repository has been archived on 2024-06-23. You can view files and clone it, but cannot push or open issues or pull requests.
mathdeptv2/文本处理工具/文本整理_图片转文字后.ipynb

485 lines
25 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os,re\n",
"\n",
"def full_stop(matchobj):\n",
" if matchobj.group(1) == \"。\" or matchobj.group(1) == \"\":\n",
" return \". \"\n",
" else:\n",
" return \".\\n\"\n",
"def refine_brackets(matchobj):\n",
" return matchobj.group(1)[1:-1]\n",
"def insert_a_blank(matchobj):\n",
" return matchobj.group(1)[:-1]+\" \"+matchobj.group(1)[-1]\n",
"def multiple_choice(matchobj):\n",
" string = \"\\\\fourch\" + \"{\" + matchobj.group(1) + \"}{\" + matchobj.group(2) + \"}{\" + matchobj.group(3) + \"}{\" + matchobj.group(4) + \"}\\n\"\n",
" return string\n",
"def boldsymbols(matchobj):\n",
" return \"\\\\i\"+matchobj.group(1)[:-1]+\"\\\\mathbf{\"+matchobj.group(1)[-1]+\"}\"\n",
"def boldsymbols_star(matchobj):\n",
" return \"\\\\in \\\\mathbf{\"+matchobj.group(1)+\"}^*\"\n",
"def singleboldsymbols(matchobj):\n",
" return \"$\\\\mathbf{\" + matchobj.group(1) + \"}$\"\n",
"def blackboardbold(matchobj):\n",
" string = \"\\\\mathbf\" + \"{\" + matchobj.group(1) + \"}\"\n",
" return string\n",
"def limit(matchobj):\n",
" return \"\\\\displaystyle\\\\lim_{\"+matchobj.group(1)+\"}\"\n",
"def replace_i(matchobj):\n",
" string = matchobj.group(1)\n",
" length = len(string)\n",
" for i in range(length-1,-1,-1):\n",
" if string[i] == \"i\" and not \"item\" in string[i:] and not \"overline\" in string[i:]:\n",
" string = string[:i] + \"\\\\mathrm{i}\" + string[i+1:]\n",
" return string\n",
"def refine_log(matchobj):\n",
" return r\"\\log_\"+matchobj.group(1)\n",
"def refine_powers(matchobj):\n",
" base = matchobj.group(1)\n",
" power = matchobj.group(2)\n",
" return base + \"^\" + power\n",
"def refine_sequences(matchobj):\n",
" return \"\\{\" + matchobj.group(1) + \"\\}\"\n",
"def refine_starting_brackets(matchobj):\n",
" return \"$\" + matchobj.group(1)\n",
"def refine_left_operating_brackets(matchobj):\n",
" obj = matchobj.group(2)\n",
" return matchobj.group(1)+obj\n",
"def refine_right_operating_brackets(matchobj):\n",
" obj = matchobj.group(1)\n",
" return obj + matchobj.group(2)\n",
"def refine_brackets_in_brackets(matchobj):\n",
" return matchobj.group(1) + matchobj.group(2) + matchobj.group(3)\n",
"def mathbf(matchobj):\n",
" return \"\\\\mathbf{\" + matchobj.group(1) + \"}^\" + matchobj.group(2)\n",
"#以上是202207之前的文本处理机制\n",
"global layer\n",
"def rename_bracket(matchobj):\n",
" return \"leftbracket\" + str(layer) + matchobj.group(1) + \"rightbracket\" + str(layer)\n",
"def frac_brackets(matchobj):\n",
" return \"frac{\"+matchobj.group(1)+\"}{\"+matchobj.group(2)+\"}\"\n",
"def frac_single_second_bracket(matchobj):\n",
" return \"frac \"+matchobj.group(1)+\"{\"+matchobj.group(2)+\"}\"\n",
"def recall_vital_bracket(matchobj):\n",
" return matchobj.group(1) + \"{\" + matchobj.group(2) + \"}\"\n",
"def sqrt_brackets(matchobj):\n",
" if matchobj.group(1) == None:\n",
" first_group = \"\"\n",
" else:\n",
" first_group = matchobj.group(1)\n",
" return \"sqrt \"+ first_group +\"{\" + matchobj.group(2) + \"}\"\n",
"#def refine_frac(string):\n",
"# for s in range(7):\n",
"# for t in range(7):\n",
"# string = re.sub(r\"frac[\\s]*leftbracket\"+str(s)+\"(.*?)\"+r\"rightbracket\"+str(s)+\"[\\s]*\"+r\"leftbracket\"+str(t)+\"(.*?)\"+r\"rightbracket\"+str(t),frac_brackets,string)\n",
"# return string\n",
"def refine_single_second_frac(string):\n",
" for s in range(7):\n",
" string = re.sub(r\"frac[\\s]*(\\w)[\\s]*leftbracket\"+str(s)+\"(.*?)\"+r\"rightbracket\"+str(s),frac_single_second_bracket,string)\n",
" return string\n",
"def refine_vital_bracket(string):\n",
" for s in range(7):\n",
" string = re.sub(r\"(frac)[\\s]*leftbracket\"+str(s)+\"(.*?)rightbracket\"+str(s),recall_vital_bracket,string)\n",
" string = re.sub(r\"(line)[\\s]*leftbracket\"+str(s)+\"(.*?)rightbracket\"+str(s),recall_vital_bracket,string)\n",
" string = re.sub(r\"(arrow)[\\s]*leftbracket\"+str(s)+\"(.*?)rightbracket\"+str(s),recall_vital_bracket,string)\n",
" string = re.sub(r\"(_)[\\s]*leftbracket\"+str(s)+\"(.*?)rightbracket\"+str(s),recall_vital_bracket,string)\n",
" string = re.sub(r\"(\\^)[\\s]*leftbracket\"+str(s)+\"(.*?)rightbracket\"+str(s),recall_vital_bracket,string)\n",
" string = re.sub(r\"(mathrm)[\\s]*leftbracket\"+str(s)+\"(.*?)rightbracket\"+str(s),recall_vital_bracket,string)\n",
" string = re.sub(r\"(mathbf)[\\s]*leftbracket\"+str(s)+\"(.*?)rightbracket\"+str(s),recall_vital_bracket,string)\n",
" string = re.sub(r\"(begin)[\\s]*leftbracket\"+str(s)+\"(.*?)rightbracket\"+str(s),recall_vital_bracket,string)\n",
" string = re.sub(r\"(end)[\\s]*leftbracket\"+str(s)+\"(.*?)rightbracket\"+str(s),recall_vital_bracket,string)\n",
" return string\n",
"def refine_sqrt(string):\n",
" for s in range(7):\n",
" string = re.sub(r\"sqrt[\\s]*(\\[\\w*\\])*[\\s]*leftbracket\"+str(s)+\"(.*?)rightbracket\"+str(s),sqrt_brackets,string)\n",
" return string\n",
"def give_blanks(string):\n",
" string = re.sub(r\"(sqrt[\\w])\",insert_a_blank,string)\n",
" string = re.sub(r\"(frac[\\w])\",insert_a_blank,string)\n",
" return string\n",
"def give_brackets(string):\n",
" string = re.sub(r\"leftbracket\\d\",\"\",string)\n",
" string = re.sub(r\"rightbracket\\d\",\"\",string)\n",
" string = re.sub(r\"leftset\",r\"\\{\",string)\n",
" string = re.sub(r\"rightset\",r\"\\}\",string)\n",
" return string\n",
"#以上是20220715新加的文本处理机制\n",
"def initial_bracket_search(string):\n",
" t = re.search(r\"^[\\s]*?leftbracket(\\d)\",string)\n",
" if t == None:\n",
" return -1\n",
" else:\n",
" return t.span()[1]\n",
"def initial_brackets_pair_search(string,d):\n",
" t = re.search(\"rightbracket\"+d,string)\n",
" if t == None:\n",
" return -1\n",
" else:\n",
" return t.span()[1]\n",
"def refine_frac(string):\n",
"    \"\"\"Turn the two placeholder groups that follow each ``frac`` back into braces.\n",
"\n",
"    For every ``frac`` in *string* that is immediately followed (optional\n",
"    whitespace allowed) by two consecutive ``leftbracketN ... rightbracketN``\n",
"    groups, the markers of those two groups are replaced by ``{`` and ``}``.\n",
"    A ``frac`` without two such groups is left unchanged.\n",
"    \"\"\"\n",
"    eq_left = \"\"  # text already scanned, up to and including the latest \"frac\"\n",
"    eq_right = string  # text still to be scanned\n",
"    while True:\n",
"        found = re.search(\"frac\", eq_right)\n",
"        if found is None:\n",
"            break\n",
"        eq_left += eq_right[:found.end()]\n",
"        eq_right = eq_right[found.end():]\n",
"        first_start = initial_bracket_search(eq_right)\n",
"        if first_start <= 0:\n",
"            continue\n",
"        # The character just before first_start is the layer digit of the opening marker.\n",
"        first_end = initial_brackets_pair_search(eq_right, eq_right[first_start - 1])\n",
"        if first_end < 0:\n",
"            # No closing marker: skip instead of slicing with the -1 sentinel.\n",
"            continue\n",
"        first_bracket = eq_right[:first_end]\n",
"        eq_remain = eq_right[first_end:]\n",
"        second_start = initial_bracket_search(eq_remain)\n",
"        if second_start <= 0:\n",
"            continue\n",
"        second_end = initial_brackets_pair_search(eq_remain, eq_remain[second_start - 1])\n",
"        if second_end < 0:\n",
"            continue\n",
"        second_bracket = eq_remain[:second_end]\n",
"        # Each group ends with \"rightbracketN\", so its last character is the layer digit.\n",
"        first_layer = first_bracket[-1]\n",
"        second_layer = second_bracket[-1]\n",
"        first_bracket = first_bracket.replace(\"leftbracket\" + first_layer, \"{\")\n",
"        first_bracket = first_bracket.replace(\"rightbracket\" + first_layer, \"}\")\n",
"        second_bracket = second_bracket.replace(\"leftbracket\" + second_layer, \"{\")\n",
"        second_bracket = second_bracket.replace(\"rightbracket\" + second_layer, \"}\")\n",
"        eq_right = first_bracket + second_bracket + eq_remain[second_end:]\n",
"    return eq_left + eq_right\n",
"#以上是20220718修改的大括号处理机制, 修复了一个bug\n",
"def reduce_blank(matchobj):\n",
" return matchobj.group(1).replace(\" \",\"\")\n",
"def add_dollars(matchobj):\n",
" return matchobj.group(1)[0] + r\"$\" + matchobj.group(1)[1:-1] + r\"$\" + matchobj.group(1)[-1]\n",
"def del_first_char(matchobj):\n",
" return matchobj.group(1)[1:]\n",
"def add_underline(matchobj):\n",
" return matchobj.group(1)[0] + \"_\" + matchobj.group(1)[-1]\n",
"def brackets_to_cwords(matchobj):\n",
" return \"左括号\"+matchobj.group(1)+\"右括号\"\n",
"def cwords_to_brackets(matchobj):\n",
" return \"(\"+matchobj.group(1)+\")\"\n",
"def circled_brackets(matchobj):\n",
" return matchobj.group(1)[:-1]+\"{\"+matchobj.group(1)[-1] + \"}\"\n",
"\n",
"\n",
"with open(\"临时文件/textfile.txt\", \"r\", encoding = \"utf8\") as textfile:\n",
" data = textfile.read()\n",
"\n",
"\n",
"\n",
"#去除左右括号的前缀\n",
"data = data.replace(r\"\\left.\",\"\").replace(r\"\\left\",\"\").replace(r\"\\right.\",\"\").replace(r\"\\right\",\"\")\n",
"\n",
"#全角半角符号替换\n",
"data = re.sub(\" \",\" \",data)\n",
"data = re.sub(\"(。[\\n]*)\",full_stop,data)\n",
"data = re.sub(\"([\\n]*)\",full_stop,data)\n",
"data = re.sub(\"\",\", \",data)\n",
"data = re.sub(\"\",\": \",data)\n",
"data = re.sub(\"\",\"; \",data)\n",
"data = re.sub(\"\",\"(\",data)\n",
"data = re.sub(\"\",\")\",data)\n",
"data = re.sub(\"\",\"? \",data)\n",
"data = re.sub(\"“\",\"``\",data)\n",
"data = re.sub(\"”\",\"''\",data)\n",
"\n",
"#替换全角数字等\n",
"data = re.sub(\"α\",r\"\\\\alpha\",data)\n",
"data = re.sub(\"β\",r\"\\\\beta\",data)\n",
"data = re.sub(\"\",\"\\{\",data)\n",
"data = re.sub(\"\",\"\\}\",data)\n",
"data = re.sub(\"\",r\"\\\\cup \",data)\n",
"data = re.sub(\"∩\",r\"\\\\cap \",data)\n",
"data = re.sub(\"∞\",r\"\\\\infty\",data)\n",
"data = re.sub(\"γ\",r\"\\\\gamma\",data)\n",
"data = re.sub(\"δ\",r\"\\\\delta\",data)\n",
"data = re.sub(\"≤\",r\"\\\\le \",data)\n",
"data = re.sub(\"≥\",r\"\\\\ge \",data)\n",
"data = re.sub(\"槡\",r\"\\\\sqrt \",data)\n",
"data = re.sub(\"\",r\"\\\\log \",data)\n",
"data = re.sub(\"\",r\"\\\\lg \",data)\n",
"data = re.sub(\"\",r\"\\\\ln \",data)\n",
"data = re.sub(\"≠\",r\"\\\\ne \",data)\n",
"data = re.sub(\"π\",r\"\\\\pi \",data)\n",
"data = re.sub(\"θ\",r\"\\\\theta \",data)\n",
"data = re.sub(\"\",r\"\\\\sin \",data)\n",
"data = re.sub(\"\",r\"\\\\cos \",data)\n",
"data = re.sub(\"\",r\"\\\\tan \",data)\n",
"data = re.sub(\"\",r\"\\\\cot \",data)\n",
"data = re.sub(\"△\",r\"\\\\triangle \",data)\n",
"data = re.sub(\"φ\",r\"\\\\varphi \",data)\n",
"data = re.sub(\"ω\",r\"\\\\omega \",data)\n",
"data = re.sub(\"珗\",r\"\\\\overrightarrow \",data)\n",
"data = re.sub(\"珝\",r\"\\\\overrightarrow \",data)\n",
"data = re.sub(\"珤\",r\"\\\\overrightarrow \",data)\n",
"data = re.sub(\"珤犲\",r\"\\\\overrightarrow e_ \",data)\n",
"data = re.sub(\"λ\",r\"\\\\lambda \",data)\n",
"data = re.sub(\"\",r\"\\\\mathrm{i}\",data)\n",
"data = re.sub(\"∈\",r\"\\\\in \",data)\n",
"data = re.sub(\"⊥\",r\"\\\\perp \",data)\n",
"data = re.sub(\"∥\",r\"\\\\parallel \",data)\n",
"data = re.sub(\"①\",r\"\\\\textcircled{1} \",data)\n",
"data = re.sub(\"②\",r\"\\\\textcircled{2} \",data)\n",
"data = re.sub(\"③\",r\"\\\\textcircled{3} \",data)\n",
"data = re.sub(\"④\",r\"\\\\textcircled{4} \",data)\n",
"data = re.sub(\"⑤\",r\"\\\\textcircled{5} \",data)\n",
"data = re.sub(\"\",r\"\\\\subseteq \",data)\n",
"data = re.sub(\"\",r\"\\\\subset \",data)\n",
"data = re.sub(\"\",r\"\\\\supset \",data)\n",
"data = re.sub(\"\",r\"\\\\Rightarrow \",data)\n",
"data = re.sub(\"\",r\"\\\\varnothing \",data)\n",
"data = re.sub(\"×\",r\"\\\\times \",data)\n",
"data = re.sub(\"·\",r\"\\\\cdot \",data)\n",
"data = re.sub(\"\",r\"\\\\%\",data)\n",
"data = re.sub(\"\",r\"\\\\text{cm}\",data)\n",
"data = re.sub(\"°\",r\"^\\\\circ \",data)\n",
"data = re.sub(\"\",r\": \",data)\n",
"data = re.sub(\"\",r\"\\\\text{m}\",data)\n",
"data = re.sub(\"∠\",r\"\\\\angle \",data)\n",
"data = re.sub(r\"→\\n \",r\"\",data)\n",
"data = re.sub(\"〈\",r\"\\\\langle \",data)\n",
"data = re.sub(\"〉\",r\"\\\\rangle \",data)\n",
"data = re.sub(\"…\",r\"\\\\cdots\",data)\n",
"data = re.sub(\"\",r\"\\\\mathrm{P}^\",data)\n",
"data = re.sub(\"\",r\"\\\\supseteq\",data)\n",
"data = re.sub(\"\",r\"\\\\mathrm{e}\",data)\n",
"data = re.sub(\"μ\",r\"\\\\mu\",data)\n",
"data = re.sub(\"ρ\",r\"\\\\rho\",data)\n",
"\n",
"\n",
"#修改一些常用的错误latex命令\n",
"data = re.sub(\"centerdot\",\"cdot\",data)\n",
"data = re.sub(\"cancel\",\"not\",data)\n",
"\n",
"whole_numbers = \"0123456789+-=犞犎犲狆狇狉犕犖><犃犅犆犇狓犝[]|犪狔犙犽犘犚犫犛犮犈犗犿犣狀犳犵犺狋犻犼狕犉犾′犱狊犌犡犢狘\"\n",
"correct_numbers = \"0123456789+-=VHepqrMN><ABCDxU[]|ayQkPRbScEOmZnfghtijzFl'dsGXY|\"\n",
"\n",
"\n",
"\n",
"# Replace each mis-recognised character literally. Several source characters\n",
"# (+, [, ], |) are regex metacharacters, so using them as re.sub patterns\n",
"# either raises re.error or matches the empty string.\n",
"for wrong_char, right_char in zip(whole_numbers, correct_numbers):\n",
"    data = data.replace(wrong_char, right_char)\n",
"\n",
"data = re.sub(\"A1\",r\"A_1\",data)\n",
"data = re.sub(\"B1\",r\"B_1\",data)\n",
"data = re.sub(\"C1\",r\"C_1\",data)\n",
"data = re.sub(\"D1\",r\"D_1\",data)\n",
"#替换题号\n",
"data = re.sub(\"(\\\\n[例]*[0-9]+\\.[\\s]+)\",\"\\\\n\\\\\\\\item \",data)\n",
"\n",
"#公式标志换成$符号\n",
"data = re.sub(\"\\\\\\\\\\[\",r\"$\",data)\n",
"data = re.sub(\"\\\\\\\\\\]\",r\"$\",data)\n",
"data = re.sub(\"\\$\\$\",\"\",data)\n",
"\n",
"#选择题替换成标准格式\n",
"data = re.sub(\"A\\.([\\s\\S]*?)B\\.([\\s\\S]*?)C\\.([\\s\\S]*?)D\\.([\\s\\S]*?)\\\\n\",multiple_choice,data)\n",
"data = re.sub(\"\\(A\\)([\\s\\S]*?)\\(B\\)([\\s\\S]*?)\\(C\\)([\\s\\S]*?)\\(D\\)([\\s\\S]*?)\\\\n\",multiple_choice,data)\n",
"data = re.sub(\"\\.([\\s\\S]*?)\\.([\\s\\S]*?)\\.([\\s\\S]*?)\\.([\\s\\S]*?)\\\\n\",multiple_choice,data)\n",
"data = re.sub(\"\\(\\)([\\s\\S]*?)\\(\\)([\\s\\S]*?)\\(\\)([\\s\\S]*?)\\(\\)([\\s\\S]*?)\\\\n\",multiple_choice,data)\n",
"data = re.sub(\"\\$[ ]+\\}\",\"$}\",data)\n",
"data = re.sub(\"\\{[ ]+\\$\",\"{$\",data)\n",
"#data = re.sub(\"\",r\"\\\\mathrm{C}^\",data)\n",
"#替换多余的空行\n",
"for i in range(20):\n",
" data = re.sub(\"\\n[\\t ]*\\n\",\"\\n\",data)\n",
"#复数变成正体i\n",
"data = re.sub(\"(\\\\n.*?复数.*?\\\\n)\",replace_i,data)\n",
"\n",
"data1 = data #替换后暂存data1\n",
"\n",
"\n",
"#针对从教材里扒下来的文字\n",
"data = re.sub(r\"\\((\\d{1,2})\\)\",brackets_to_cwords,data)\n",
"data = re.sub(r\"([A-Z][0-9])\",add_underline,data)\n",
"data = re.sub(r\"([\\u4e00-\\u9fa5、 \\)][0-9a-zA-Z_\\^\\\\\\{\\}|=><\\s\\n'\\(\\)\\-\\+:±]{2,}[\\u4e00-\\u9fa5、\\,\\.;\\( ])\",add_dollars,data)\n",
"data = re.sub(r\"([\\u4e00-\\u9fa5、\\)][0-9a-zA-Z_\\^\\\\\\{\\}|=><\\s\\n'\\(\\)\\-\\+:±]{2,}[\\u4e00-\\u9fa5、\\,\\.;\\( ])\",add_dollars,data)\n",
"data = re.sub(r\"([\\u4e00-\\u9fa5、 \\)][0-9a-zA-Z]{1}[\\u4e00-\\u9fa5、\\,\\.;\\( ])\",add_dollars,data)\n",
"data = re.sub(r\"左括号\\$(\\d{1,2})\\$右括号\",cwords_to_brackets,data)\n",
"data = re.sub(r\"\\$\\,[\\s]*\\$\",\", \",data)\n",
"data = re.sub(r\"(\\n[^(\\\\])\",del_first_char,data)\n",
"\n",
"\n",
"\n",
"#分离文字和公式\n",
"raw_texts = [] #文字数组\n",
"raw_equations = [] #公式数组\n",
"d = data\n",
"while len(d) > 0:\n",
" interval = re.search(r\"\\$[\\s\\S]*?\\$\",d)\n",
" if not interval == None:\n",
" (start, end) = interval.span()\n",
" raw_texts.append(d[:start])\n",
" raw_equations.append(d[start:end])\n",
" d = d[end:]\n",
" else:\n",
" raw_texts.append(d)\n",
" d = \"\"\n",
"#至此已经分离了文字和公式,公式在两个$之内,包含两个$\n",
"\n",
"modified_texts = []\n",
"modified_equations = []\n",
"\n",
"for text in raw_texts:\n",
" text1 = text\n",
" #删除选项中无用的空格\n",
" text1 = re.sub(\"\\{[\\s]+?\",\"{\",text1)\n",
" text1 = re.sub(\"[\\s]+?\\}\",\"}\",text1)\n",
" #填空题的处理\n",
" text1 = re.sub(\"[ _]{5,}\",r\"\\\\blank{50}\",text1)\n",
" #选择题的处理\n",
" text1 = re.sub(r\"\\(\\\\blank\\{50\\}\\)\",\"\\\\\\\\bracket{20}\",text1)\n",
" text1 = re.sub(r\"\\([\\s]{1,10}\\)\",\"\\\\\\\\bracket{20}\",text1)\n",
" #逗号后面加空格\n",
" text1 = re.sub(\",[ ]*\",\", \",text1)\n",
" text1 = re.sub(r\"\\.\\}\",\"}\",text1)\n",
" text1 = re.sub(r\"\\n\\d{1,3}\\.\",r\"\\n\\\\item \",text1)\n",
" text1 = re.sub(r\"\\s{3,}\\.\",r\"\\\\blank{50}.\",text1)\n",
" text1 = re.sub(r\"\\s{3,}\\,\",r\"\\\\blank{50},\",text1)\n",
" text1 = re.sub(r\"\\\\bracket\\{20\\}\\n\",r\"\\\\bracket{20}.\\n\",text1)\n",
" text1 = re.sub(r\"\\\\mathrm{\\\\mathrm{i}}\",r\"\\\\mathrm{i}\",text1)\n",
" text1 = re.sub(r\"\\\\\\\\mathrm{i}n\",r\"\\\\in \",text1)\n",
" modified_texts.append(text1)\n",
"\n",
"for equation in raw_equations:\n",
" equation1 = equation\n",
" # 去除单个字周围的大括号和去除双重无意义的大括号\n",
" for i in range(20):\n",
" equation1 = re.sub(\"(\\{[\\w\\+\\-\\*]?\\})\",refine_brackets,equation1)\n",
" for i in range(20):\n",
" equation1 = re.sub(\"(\\{\\{[^\\{\\}]*?\\}\\})\",refine_brackets,equation1)\n",
" #去除公式中的无意义的空格\n",
" equation1 = re.sub(\"\\\\\\\\[!,]|\\\\\\\\quad|\\\\\\\\qquad\",\"\",equation1)\n",
" equation1 = re.sub(\"\\{ *\\}\",\"\",equation1)\n",
" equation1 = re.sub(\"\\( *\",\"(\",equation1)\n",
" equation1 = re.sub(\" *\\)\",\")\",equation1)\n",
" equation1 = re.sub(\"\\$ *\",\"$\",equation1)\n",
" equation1 = re.sub(\" *\\$\",\"$\",equation1)\n",
" #改善大括号20220715\n",
" layer = 0\n",
" equation1 = re.sub(r\"\\\\\\{\",\"leftset\",equation1)\n",
" equation1 = re.sub(r\"\\\\\\}\",\"rightset\",equation1)\n",
" for layer in range(7):\n",
" equation1 = re.sub(r\"\\{([^\\{\\}]*)\\}\",rename_bracket,equation1)\n",
" equation1 = refine_sqrt(equation1)\n",
" equation1 = refine_vital_bracket(refine_single_second_frac(refine_frac(equation1)))\n",
" equation1 = give_blanks(equation1)\n",
" equation1 = give_brackets(equation1)\n",
" \n",
" #改善交集和并集\n",
" equation1 = equation1.replace(\"\\\\bigc\",\"\\\\c\")\n",
" #在数集中的数集改为粗黑体\n",
" equation1 = re.sub(\"\\\\\\i(n[ ]*[R|Q|Z|N|C])\",boldsymbols,equation1)\n",
" equation1 = re.sub(r\"\\\\in[\\s]*?\\{([NZQRC])\\^\\*\\}\",boldsymbols_star,equation1)\n",
" equation1 = re.sub(r\"\\\\mathbf([ZRNQC])\",blackboardbold,equation1)\n",
" equation1 = re.sub(r\"\\\\text([ZRNQC])\",blackboardbold,equation1)\n",
" equation1 = re.sub(\"operatorname\",\"mathbf \",equation1)\n",
" #equation1 = re.sub(\"\\$([R|Q|Z|N|C])\\$\",singleboldsymbols,equation1)\n",
" #有关数列极限\n",
" equation1 = re.sub(r\"\\\\underset{([\\w]\\\\to \\\\infty) }{\\\\mathop\\\\lim }\\\\,\",limit,equation1) \n",
" equation1 = re.sub(r\"\\\\underset{([\\w]\\\\to \\\\infty) }{\\\\mathop{lim}}\\\\,\",limit,equation1) \n",
" equation1 = re.sub(r\"\\\\underset{([\\w]\\\\to \\\\infty) }{\\\\mathop{\\\\lim }}\\\\,\",limit,equation1)\n",
" equation1 = re.sub(r\"\\\\underset{([\\w]\\\\to \\\\infty) }{\\\\mathop{\\\\lim }}\",limit,equation1)\n",
" equation1 = re.sub(r\"\\\\underset{([\\w]\\\\to \\\\infty) }{\\\\mathop{\\\\lim }\",limit,equation1)\n",
" #修改centerdot\n",
" equation1 = re.sub(r\"centerdot\",r\"cdot\",equation1)\n",
" #分情况和方程组的处理\n",
" equation1 = re.sub(\"align\\}[\\s]*\",\"cases}\",equation1)\n",
" equation1 = re.sub(r\"\\\\\\\\[\\s]*\",\"\\\\\\\\\\\\\\\\\",equation1)\n",
" equation1 = re.sub(r\"&\",r\"\",equation1)\n",
" equation1 = re.sub(r\"\\\\\\{[\\s]*\\\\begin\",r\"\\\\begin\",equation1)\n",
" equation1 = re.sub(r\"\\\\\\\\\\\\end\",r\"\\\\end\",equation1)\n",
" #分式变displaystyle\n",
" equation1 = re.sub(r\"\\\\frac\",r\"\\\\dfrac\",equation1)\n",
" #处理多余的斜杠空格\n",
" equation1 = re.sub(r\"\\\\[\\s]*?,[\\s]*?\\\\\",\",\",equation1)\n",
" #处理三个点的写法\n",
" equation1 = re.sub(r\"\\\\cdot[\\s]*?\\\\cdot[\\s]*?\\\\cdot\",r\"\\\\cdots\",equation1)\n",
" #\\bot改为\\perp\n",
" equation1 = re.sub(r\"\\\\bot\",r\"\\\\perp\",equation1)\n",
" #\\texti等改为\\mathrm{i}\n",
" equation1 = re.sub(r\"\\\\texti\",r\"\\\\mathrm{i}\",equation1)\n",
" equation1 = re.sub(r\"\\\\mathrmi\",r\"\\\\mathrm{i}\",equation1)\n",
" #处理矩阵与行列式\n",
" equation1 = re.sub(r\"\\([\\s]*?\\\\begin\\{matrix\\}\",r\"\\\\begin{pmatrix}\",equation1)\n",
" equation1 = re.sub(r\"\\\\end\\{matrix\\}[\\s]*?\\)\",r\"\\\\end{pmatrix}\",equation1)\n",
" equation1 = re.sub(r\"\\|[\\s]*?\\\\begin\\{matrix\\}\",r\"\\\\begin{vmatrix}\",equation1)\n",
" equation1 = re.sub(r\"\\\\end\\{matrix\\}[\\s]*?\\|\",r\"\\\\end{vmatrix}\",equation1)\n",
" equation1 = re.sub(r\"\\\\Delta\",r\"\\\\triangle\",equation1)\n",
" equation1 = re.sub(r\"\\\\vartriangle\",r\"\\\\triangle\",equation1)\n",
" equation1 = re.sub(r\"\\\\\\{\\s*\\.\\s*\",r\"\\\\{\",equation1)\n",
" equation1 = re.sub(r\"\\s*\\|\\s*\",\"|\",equation1)\n",
" equation1 = re.sub(r\"\\s*\\\\\\}\",r\"\\\\}\",equation1)\n",
" equation1 = re.sub(r\"\\\\\\{\\s*\",r\"\\\\{\",equation1)\n",
" equation1 = re.sub(r\"\\{ *([ZRNQC])\\^([\\+\\-*]) *\\}\",mathbf,equation1)\n",
" equation1 = re.sub(r\"\\{\\\\log *\\}_\",r\"\\\\log_\",equation1)\n",
" equation1 = re.sub(r\"([^\\\\]\\s+?\\])\",reduce_blank,equation1)\n",
" equation1 = re.sub(r\"([^\\\\]\\s+?\\))\",reduce_blank,equation1)\n",
" equation1 = re.sub(r\"(\\(\\s+?)\",reduce_blank,equation1)\n",
" equation1 = re.sub(r\"(\\[\\s+?)\",reduce_blank,equation1)\n",
" modified_equations.append(equation1)\n",
"\n",
"\n",
"#整合修改过的文本和公式 \n",
"# Interleave the text pieces and equation pieces in their original order.\n",
"# The splitting loop can produce one more text piece than equation pieces\n",
"# (trailing text after the last equation), so check the index explicitly\n",
"# instead of hiding every error behind a bare except.\n",
"merged_pieces = []\n",
"for i, text_piece in enumerate(modified_texts):\n",
"    merged_pieces.append(text_piece)\n",
"    if i < len(modified_equations):\n",
"        merged_pieces.append(modified_equations[i])\n",
"modified_data = \"\".join(merged_pieces)\n",
"modified_data = re.sub(r\"[ ]+\\n\",\"\\n\",modified_data)\n",
"modified_data = re.sub(r\"\\$[\\s]*?\\\\parallel[\\s]*?\\$\",r\"\\\\parallel\",modified_data)\n",
"modified_data = re.sub(r\"\\n例\\s*?\\d{1,3}\\s*\",r\"\\n\\\\item \",modified_data)\n",
"modified_data = re.sub(r\"ABCDA_1B_1C_1D_1\",r\"$ABCD-A_1B_1C_1D_1$\",modified_data)\n",
"modified_data = re.sub(r\"(\\$[\\,\\.:;]\\$)\",refine_brackets,modified_data)\n",
"modified_data = re.sub(r\"\\$\\$\",\"$\",modified_data)\n",
"\n",
"\n",
"\n",
"#以下是扒上教社教材题目时作的改动\n",
"modified_data = re.sub(r\"log[\\s]\",r\"log_\",modified_data)\n",
"modified_data = re.sub(r\"(textcircled[\\d])\",circled_brackets,modified_data)\n",
"\n",
"with open(\"临时文件/outputfile.txt\",\"w\",encoding = \"utf8\") as f:\n",
" f.write(modified_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.8 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "d311ffef239beb3b8f3764271728f3972d7b090c974f8e972fcdeedf230299ac"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}