database_tools中更新mathpix识别文本预处理功能

This commit is contained in:
weiye.wang 2023-07-16 22:33:12 +08:00
parent 9983e77a79
commit 006c4cf2f2
1 changed files with 52 additions and 2 deletions

View File

@ -937,11 +937,61 @@ def getCopy(): # 获取剪切板内容
return t
def setCopy(string):  # write text to the clipboard
    """Replace the clipboard contents with ``string`` as Unicode text.

    The clipboard is opened, emptied, filled with ``string`` in the
    ``CF_UNICODETEXT`` format and closed again.

    Args:
        string: The text to place on the clipboard.
    """
    # NOTE(review): ``wc`` is presumably pywin32's ``win32clipboard``,
    # imported elsewhere in this file -- confirm.
    wc.OpenClipboard()
    try:
        wc.EmptyClipboard()
        wc.SetClipboardData(win32con.CF_UNICODETEXT, string)
    finally:
        # Always release the clipboard, even if emptying or writing fails,
        # so that it is not left open by this process.
        wc.CloseClipboard()
def RefineMathpix(raw_string):
    """Clean up a string recognised by Mathpix and return the result.

    The substitutions below run in a fixed order, and later steps rely on
    the output of earlier ones:

    1. drop ``\\left`` / ``\\right`` markers;
    2. turn full-width punctuation into half-width punctuation;
    3. rewrite some LaTeX commands and fix known recognition errors;
    4. turn question numbers into ``\\item``, display math into inline
       math, ``\\frac`` into ``\\dfrac``;
    5. collect A/B/C/D options into ``\\fourch{..}{..}{..}{..}``;
    6. remove ``\\quad`` variants, ``~``, redundant braces and spaces;
    7. insert ``\\bracket{20}`` / ``\\blank{50}`` placeholders;
    8. hand the text to ``RefineChineseComma``.

    Args:
        raw_string: The text as produced by Mathpix.

    Returns:
        The processed string.
    """
    # Full-width punctuation -> half-width replacement.  The keys are
    # written as escapes so they cannot be lost to an encoding mishap; an
    # empty key would make re.sub insert the replacement at every position.
    # NOTE(review): the bracket pair is assumed to be the lenticular
    # brackets (U+3010/U+3011) and the first key the ideographic space
    # (U+3000) -- confirm against the upstream file.
    fullwidth_to_halfwidth = {
        "\u3000": " ",    # ideographic space
        "\u3002": ". ",   # ideographic full stop
        "\uff0e": ". ",   # fullwidth full stop
        "\uff0c": ", ",   # fullwidth comma
        "\uff1a": ": ",   # fullwidth colon
        "\uff1b": "; ",   # fullwidth semicolon
        "\uff08": "(",    # fullwidth left parenthesis
        "\uff09": ")",    # fullwidth right parenthesis
        "\uff1f": "? ",   # fullwidth question mark
        "\u201c": "``",   # left double quotation mark
        "\u201d": "''",   # right double quotation mark
        "\u3010": "[",    # left lenticular bracket
        "\u3011": "]",    # right lenticular bracket
    }
    # Regex pattern -> replacement template for LaTeX commands.
    replacestrings = {r"\\overparen": r"\\overset\\frown", "eqslant": "eq", r"\\vec": r"\\overrightarrow ", r"\\bar": r"\\overline", r"\\lim": r"\\displaystyle\\lim", r"\\sum":r"\\displaystyle\\sum", r"\\prod":r"\\displaystyle\\prod", r"\\mid":"|", r"\^\{\\prime\}":"'",r"e\^":r"\\mathrm{e}^"}
    # Regex pattern -> correction for frequent recognition mistakes.
    # NOTE(review): two further entries had empty keys and empty values in
    # the copy this was edited from (a no-op); their original characters
    # could not be recovered -- restore them from the upstream file.
    wrongrecog = {"[粗秿]圆":"椭圆","投郑":"投掷","抛郑":"抛掷","范目":"范围","末见":"未见","末成":"未成"}

    def _format_choices(matchobj):
        # Build the \fourch line from the four captured option texts.
        return "\n\\fourch{%s}{%s}{%s}{%s}\n" % (
            matchobj.group(1).strip(),
            matchobj.group(2).strip(),
            matchobj.group(3).strip(),
            matchobj.group(4).strip(),
        )

    string = raw_string
    # Remove the \left and \right markers in front of delimiters.
    string = re.sub(r"\\left(?:\.?)|\\right(?:\.?)","",string)
    # Replace full-width punctuation by half-width punctuation (literal
    # replacement; none of the keys or values needs regex treatment).
    for fullwidth, halfwidth in fullwidth_to_halfwidth.items():
        string = string.replace(fullwidth, halfwidth)
    # Rewrite some LaTeX commands into the customary ones.
    for pattern, replacement in replacestrings.items():
        string = re.sub(pattern, replacement, string)
    # Correct common Mathpix recognition errors.
    for pattern, replacement in wrongrecog.items():
        string = re.sub(pattern, replacement, string)
    # Remove whitespace around quotes and braces.
    string = re.sub(r"[\s]*(``|''|\}|\{)[\s]*",lambda matchobj: matchobj.group(1),string)
    # Replace question numbers by \item.
    string = re.sub(r"(?:(?:^|\n)+[例]*[\s]*[0-9]+[\s]*[\.、\s]+|\[[\s]*例[\s]*[0-9]*[\s]*\][\s]*)","\\n\\\\item ",string)
    # Display math becomes inline math.
    string = re.sub(r"\$\$","$",string)
    # Separate punctuation from a following $.
    string = re.sub(r"([,.:;?!])\$",lambda x:x.group(1)+" $",string)
    # Replace frac by dfrac.
    string = re.sub(r"\\frac",r"\\dfrac",string)
    # Collect the four options of a multiple-choice question.
    string = re.sub(r"\n(?:A\.|\(A\))([\s\S]*?)(?:B\.|\(B\))([\s\S]*?)(?:C\.|\(C\))([\s\S]*?)(?:D\.|\(D\))([\s\S]*?)\n",_format_choices,string)
    # Drop a full stop or semicolon at the very end of an option.
    string = re.sub(r"[\.;](\}\{|\}\n)",lambda matchobj: matchobj.group(1),string)
    # Delete superfluous line breaks.
    string = re.sub(r"\n\s+","\n",string)
    # Delete \quad, \qquad and so on.
    string = re.sub(r"\\q+uad","",string)
    # Delete ~.
    string = re.sub(r"~","",string)
    string = re.sub(r"\s*\([\s]{,10}\)",r"\\bracket{20}",string)
    # Empty parentheses at the end of a line get a trailing full stop.
    string = re.sub(r"\s*\\bracket\{20\}\s*\n",r"\\bracket{20}.\n",string)
    # Each of the next three substitutions is applied twice because its
    # matches cannot overlap within a single pass.
    for _ in range(2):
        # Remove braces around a single-character super-/subscript.
        string = re.sub(r"(\^|_)\{([0-9a-zA-Z])\}",lambda matchobj: matchobj.group(1)+matchobj.group(2),string)
    for _ in range(2):
        # Join digits / capital letters separated by stray spaces.
        string = re.sub(r"([0-9A-Z])\s+([0-9A-Z])",lambda matchobj: matchobj.group(1)+matchobj.group(2),string)
    for _ in range(2):
        # Join Chinese characters separated by stray spaces.
        string = re.sub(r"([\u4e00-\u9fa5])\s+([\u4e00-\u9fa5])",lambda matchobj: matchobj.group(1)+matchobj.group(2),string)
    # Upright letters for permutation and combination numbers.
    string = re.sub(r"([CP])(_[^_\^]{,5}\^)",lambda x:r"\mathrm{"+x.group(1)+"}"+x.group(2),string)
    # Inside super-/subscripts go back from \dfrac to \frac.
    string = re.sub(r"([\^_])\{([-]{0,1})\\dfrac",lambda matchobj: matchobj.group(1) + "{" + matchobj.group(2) + "\\frac",string)
    # Replace ldots by cdots.
    string = re.sub(r"ldots",r"cdots",string)
    # Turn the array environment of piecewise functions into cases.
    string = re.sub(r"[\\{]*\\(begin|end)\{array\}(?:\{[rcl]*\}){0,1}",lambda matchobj: "\\" + matchobj.group(1) + "{cases}",string)
    # A question whose last line ends in Chinese or a formula gets a blank.
    string = re.sub(r"([\u4e00-\u9fa5\$])[\s]*\n\\item",lambda matchobj: matchobj.group(1)+"\\blank{50}.\n\\item",string)
    # Insert a blank where the sentence asks for a value mid-line.
    string = re.sub(r"(是|为|(?:=\$))\s*([,.;\n])",lambda matchobj: matchobj.group(1) + "\\blank{50}" + ("." if matchobj.group(2) == "\n" else "") + matchobj.group(2),string)
    # A stem followed by \fourch gets the multiple-choice bracket.
    string = re.sub(r"([\u4e00-\u9fa5\$])(?:\\bracket\{20\})*[\.]*[\s]*\n\\fourch",lambda matchobj: matchobj.group(1)+"\\bracket{20}.\n\\fourch",string)
    # Comment lines do not get \blank{50}.
    string = re.sub(r"(%[^\n]*)\\blank\{50\}\.",lambda matchobj:matchobj.group(1),string)
    # Put a LaTeX line break before a sub-question number on a new line.
    string = re.sub(r"[\\\\]*\n(\(\d{1,2}\))(?:(?!\n)\s)*",lambda matchobj: "\\\\\n"+matchobj.group(1)+" ",string)
    # Fix enumeration commas inside math mode.
    string = RefineChineseComma(string)
    return string
def RefineChineseComma(string):
    """Put ``$`` on both sides of every enumeration comma inside math mode.

    An enumeration comma (``、``) counts as being inside math mode when an
    odd number of ``$`` characters precedes it.  Each such comma is
    replaced by ``$、$`` and the whitespace directly around it is removed.
    Commas outside math mode are left untouched.

    Args:
        string: The text to process.

    Returns:
        The text with the affected commas rewritten.
    """
    # The pattern must be the comma itself: an empty pattern matches at
    # every index and would overwrite every character in math mode.
    comma_positions = [match.start() for match in re.finditer("、", string)]
    # Work from right to left so that an edit never moves a comma that
    # still has to be examined.
    for pos in reversed(comma_positions):
        before = string[:pos]
        if before.count("$") % 2 == 1:
            string = before.rstrip() + "$、$" + string[pos + 1:].lstrip()
    return string
if __name__ == "__main__":