From ed172f60a343e89bf8964356477c52ddd8cb8b8b Mon Sep 17 00:00:00 2001 From: wangweiye7840 Date: Mon, 17 Jul 2023 16:39:59 +0800 Subject: [PATCH] =?UTF-8?q?database=5Ftools=E4=B8=AD=E7=9A=84mathpix?= =?UTF-8?q?=E6=96=87=E6=9C=AC=E9=A2=84=E5=A4=84=E7=90=86=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E4=BA=86=E5=AF=B9=E9=80=97=E5=8F=B7=E7=9A=84=E4=B8=80=E4=BA=9B?= =?UTF-8?q?=E5=88=86=E6=9E=90=E5=92=8C=E9=A2=84=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 工具v2/database_tools.py | 73 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/工具v2/database_tools.py b/工具v2/database_tools.py index 54c43cc5..9111ee8c 100644 --- a/工具v2/database_tools.py +++ b/工具v2/database_tools.py @@ -958,6 +958,7 @@ def RefineMathpix(raw_string): # 进一步修改mathpix得到的字符串 string = re.sub(r"[\s]*(``|''|\}|\{)[\s]*",lambda matchobj: matchobj.group(1),string) #去除符号前后的空格 string = re.sub(r"(?:(?:^|\n)+[例]*[\s]*[0-9]+[\s]*[\.、\s]+|\[[\s]*例[\s]*[0-9]*[\s]*\][\s]*)","\\n\\\\item ",string) #将题号替换为\item string = re.sub(r"\$\$","$",string) #行间公式替换为行内公式 + string = re.sub(r"\$\s+\$"," ",string) #删除多余的$符号 string = re.sub(r"([,.:;?!])\$",lambda x:x.group(1)+" $",string) #标点和$符号分开 string = re.sub(r"\\frac",r"\\dfrac",string) #替换frac为dfrac string = re.sub(r"\n(?:A\.|\(A\))([\s\S]*?)(?:B\.|\(B\))([\s\S]*?)(?:C\.|\(C\))([\s\S]*?)(?:D\.|\(D\))([\s\S]*?)\n",lambda matchobj: "\n\\fourch{%s}{%s}{%s}{%s}\n"%(matchobj.group(1).strip(),matchobj.group(2).strip(),matchobj.group(3).strip(),matchobj.group(4).strip()),string) # 选择题的选择支处理 @@ -983,7 +984,10 @@ def RefineMathpix(raw_string): # 进一步修改mathpix得到的字符串 string = re.sub(r"(%[^\n]*)\\blank\{50\}\.",lambda matchobj:matchobj.group(1),string) #注释行不加\blank{50} string = re.sub(r"[\\\\]*\n(\(\d{1,2}\))(?:(?!\n)\s)*",lambda matchobj: "\\\\\n"+matchobj.group(1)+" ",string) #新一行的小题号回车前加上换行符 string = re.sub(r"\(([^\(\)]*(?:\\in |=|\\ge|\\le|>|<)[^\(\)]*)\)\$",lambda matchobj: "$($" 
# NOTE(review): this span was a whitespace-collapsed patch hunk.  The hunk
# context that belongs to RefineMathpix / RefineChineseComma (both only
# partially visible in the patch) is preserved here as comments so the call
# order introduced by the patch is not lost:
#
#     ... + matchobj.group(1) + "$)", string)  # move the parentheses of a trailing range statement outside the math environment
#     string = SplitMathComma(string)      # added: split "," inside math where appropriate
#     string = MergeMathComma(string)      # added: re-join "$, $" seams that cut through a bracket
#     string = RefineChineseComma(string)  # existing: fix enumeration commas
#     return string
#
#     def RefineChineseComma(string):  # wrap an enumeration comma inside math with "$" on both sides
#         ...
#         if string[:pos].count("$") % 2 == 1:
#             string = string[:pos].rstrip() + "$、$" + string[(pos+1):].lstrip()
#         return string

# Opener/closer regex pairs that must never be separated by a split.
_UNSPLITTABLE_PAIRS = (
    (r"[\(\[]", r"[\)\]]"),
    (r"\{", r"\}"),
    (r"\\begin\{cases\}", r"\\end\{cases\}"),
)

# Relation symbols that mark a sub-formula as a standalone statement.
# The negative lookahead stops `\le` / `\in` / `\ge` from matching the prefix
# of longer control words such as `\left`, `\int`, `\infty` or `\gets`.
_RELATION = re.compile(
    r"=|>|<|\\(?:geqslant|leqslant|geq|leq|ge|le|in)(?![A-Za-z])"
)

# A closing "$", a comma, optional whitespace and an opening "$".
_MATH_SEAM = re.compile(r"\$,\s*\$")


def SubstringOccurence(regex, string):
    """Return the start index of every non-overlapping match of *regex* in *string*."""
    return [match.start() for match in re.finditer(regex, string)]


def _comma_is_inside_bracket(left, right):
    """Tell whether a comma between *left* and *right* sits inside an open bracket pair.

    *left* / *right* are the parts of one math environment before / after the
    comma.  The comma counts as enclosed when *left* ends with an unclosed
    opener, or *right* begins with a closer that has no opener before it.
    """
    for opener, closer in _UNSPLITTABLE_PAIRS:
        open_left = SubstringOccurence(opener, left)
        close_left = SubstringOccurence(closer, left)
        open_right = SubstringOccurence(opener, right)
        close_right = SubstringOccurence(closer, right)
        if open_left and (not close_left or open_left[-1] > close_left[-1]):
            return True
        if close_right and (not open_right or open_right[0] > close_right[0]):
            return True
    return False


def SplitMathComma(string):
    """Split ``$a=1,b=2$`` style formulas into ``$a=1$, $b=2$``.

    Commas are visited from right to left.  A comma is split only when
      * it lies inside a math environment (odd number of ``$`` before it),
      * it is not enclosed by ``()``/``[]``, ``{}`` or a ``cases`` block, and
      * both neighbouring parts contain a relation (``=``, ``<``, ``>``,
        ``\\le``, ``\\ge``, ``\\in`` and their ``q``/``qslant`` variants).

    Fixes relative to the previous implementation:
      * ``\\left``, ``\\int``, ``\\infty`` ... are no longer mistaken for
        relations;
      * every comma is examined (the character before an unsplit comma is no
        longer dragged along, which skipped the first of two adjacent commas);
      * whitespace that follows the closing ``$`` is no longer stripped.

    Args:
        string: text with ``$...$`` inline math.

    Returns:
        The text with qualifying commas moved outside the math environments.
    """
    head, tail = string, ""  # head: still to scan; tail: already processed
    while True:
        pos = head.rfind(",")
        if pos == -1:
            break
        split_here = False
        if head.count("$", 0, pos) % 2 == 1:  # comma is inside $...$
            text = head + tail  # indices below len(head) are unchanged
            start = text.rfind("$", 0, pos)
            end = text.find("$", pos)
            if end != -1:  # an unterminated environment is left untouched
                left = text[start + 1:pos]
                right = text[pos + 1:end]
                split_here = (
                    not _comma_is_inside_bracket(left, right)
                    and _RELATION.search(left) is not None
                    and _RELATION.search(right) is not None
                )
        if split_here:
            tail = ", $" + head[pos + 1:].lstrip() + tail
            head = head[:pos] + "$"
        else:
            tail = head[pos:] + tail
            head = head[:pos]
    return head + tail


def MergeMathComma(string):
    """Re-join ``$...$, $...$`` when the seam cuts through a bracket pair.

    For every ``$,<spaces>$`` seam (visited right to left) the math content on
    its left and right is inspected; if, for any bracket kind, the left part
    has exactly one more opener than closers and the right part exactly one
    more closer than openers, the two environments are merged into one with
    ``", "`` in between.

    Fixes relative to the previous implementation:
      * the first character of the right-hand formula is no longer dropped
        when merging (``$f(a$, $b)$`` used to become ``$f(a, )$``);
      * the search for the closing ``$`` starts at the first character of the
        right-hand formula instead of one past it.

    Args:
        string: text with ``$...$`` inline math.

    Returns:
        The text with bracket-breaking seams merged.
    """
    head, tail = string, ""  # head: still to scan; tail: already processed
    while True:
        seams = list(_MATH_SEAM.finditer(head))
        if not seams:
            break
        seam_start, right_start = seams[-1].span()
        text = head + tail  # indices below len(head) are unchanged
        left_open = text.rfind("$", 0, seam_start)
        right_close = text.find("$", right_start)
        left = text[left_open + 1:seam_start]
        right = text[right_start:right_close] if right_close != -1 else ""
        merge = any(
            len(SubstringOccurence(opener, left))
            - len(SubstringOccurence(closer, left)) == 1
            and len(SubstringOccurence(closer, right))
            - len(SubstringOccurence(opener, right)) == 1
            for opener, closer in _UNSPLITTABLE_PAIRS
        )
        if merge:
            tail = ", " + head[right_start:] + tail
        else:
            tail = head[seam_start:] + tail
        head = head[:seam_start]
    return head + tail


if __name__ == "__main__":
    print("数据库工具, import用.")