修复 mathpix 预处理中将表格中的首数字误识别为题号的 bug

This commit is contained in:
weiye.wang 2024-02-28 22:30:40 +08:00
parent a699f604b4
commit c05e25ad23
1 changed files with 12 additions and 1 deletions

View File

@ -1356,6 +1356,17 @@ def setCopy(string): # 写入剪切板内容
wc.SetClipboardData(win32con.CF_UNICODETEXT, string)
wc.CloseClipboard()
def itemizeProblems(string):  # replace problem numbers with \item
    r"""Replace problem-number prefixes with ``\item``, one line at a time.

    The input is split on ``"\n"`` and each line is handled independently:

    * A line containing ``&`` is passed through untouched. ``&`` is the
      column separator in LaTeX tables, so such a line is assumed to be a
      table row whose leading number is data, not a problem number.
    * On any other line, every match of the problem-number pattern is
      replaced by a newline followed by ``\item `` (with a trailing space).

    The pattern matches either of:

    * at the start of the line: optional ``例``, optional whitespace, one or
      more digits, optional whitespace, then at least one of ``.``, ``、`` or
      whitespace;
    * anywhere in the line: a bracketed example tag such as ``[例 3]`` (the
      digits are optional), together with any whitespace following it.

    Args:
        string: The text to process.

    Returns:
        The processed lines joined back together with ``"\n"``.
    """
    # Compiled once, outside the per-line loop. The ``\n`` branch of
    # ``(?:^|\n)+`` can never fire here because the lines were produced by
    # splitting on "\n"; the pattern is kept verbatim so it stays identical to
    # the expression this helper was extracted from.
    number_pattern = re.compile(
        r"(?:(?:^|\n)+[例]*[\s]*[0-9]+[\s]*[\.、\s]+|\[[\s]*例[\s]*[0-9]*[\s]*\][\s]*)"
    )
    # re template: ``\n`` expands to a newline and ``\\`` to one backslash,
    # so each match becomes "<newline>\item ".
    item_template = "\\n\\\\item "
    return "\n".join(
        line if "&" in line else number_pattern.sub(item_template, line)
        for line in string.split("\n")
    )
def RefineMathpix(raw_string): # 进一步修改mathpix得到的字符串
puctuationsfulltosemi = {" ": " ","": ". ","": ". ","": ", ","": ": ","": "; ","": "(","": ")","": "? ","": "``","": "''", "": "[", "": "]"}
replacestrings = {r"\\overparen": r"\\overset\\frown", "eqslant": "eq", r"\\vec": r"\\overrightarrow ", r"\\bar": r"\\overline", r"\\lim": r"\\displaystyle\\lim", r"\\sum":r"\\displaystyle\\sum", r"\\prod":r"\\displaystyle\\prod", r"\\mid":"|", r"\^\{\\prime\}":"'",r"e\^":r"\\mathrm{e}^",r"/\s*/":r"\\parallel "}
@ -1373,7 +1384,7 @@ def RefineMathpix(raw_string): # 进一步修改mathpix得到的字符串
for s in wrongrecog:
string = re.sub(s,wrongrecog[s],string) #修改mathpix识别的一些常见错别字
string = re.sub(r"[\s]*(``|''|\}|\{) *",lambda matchobj: matchobj.group(1),string) #去除符号前后的空格
string = re.sub(r"(?:(?:^|\n)+[例]*[\s]*[0-9]+[\s]*[\.、\s]+|\[[\s]*例[\s]*[0-9]*[\s]*\][\s]*)","\\n\\\\item ",string) #将题号替换为\item
string = itemizeProblems(string) #将题号替换为\item
string = re.sub(r"\$\$","$",string) #行间公式替换为行内公式
string = re.sub(r"\$\s+\$"," ",string) #删除多余的$符号
string = re.sub(r"([,.:;?!])\$",lambda x:x.group(1)+" $",string) #标点和$符号分开