374 lines
15 KiB
Python
374 lines
15 KiB
Python
import os,re
|
||
import win32clipboard as wc
|
||
import win32con
|
||
|
||
def getCopy():
    """Return the clipboard contents requested as ``CF_UNICODETEXT``.

    The clipboard is closed again in a ``finally`` clause, so it is not
    left open when ``GetClipboardData`` raises.

    Returns:
        Whatever ``wc.GetClipboardData(win32con.CF_UNICODETEXT)`` returns.
    """
    wc.OpenClipboard()
    try:
        return wc.GetClipboardData(win32con.CF_UNICODETEXT)
    finally:
        wc.CloseClipboard()
|
||
|
||
def setCopy(str):
    """Replace the clipboard contents with the given text.

    The clipboard is emptied and then filled with ``str`` as
    ``CF_UNICODETEXT``.  It is closed again in a ``finally`` clause, so it
    is not left open when emptying or writing raises.

    Args:
        str: The text to place on the clipboard.  (The name shadows the
            builtin; it is kept so existing callers are unaffected.)
    """
    wc.OpenClipboard()
    try:
        wc.EmptyClipboard()
        wc.SetClipboardData(win32con.CF_UNICODETEXT, str)
    finally:
        wc.CloseClipboard()
|
||
|
||
def full_stop(matchobj):
    """Regex callback that rewrites a matched full stop as an ASCII period.

    Args:
        matchobj: Match whose group 1 is a stop character, optionally
            followed by further characters (the callers in this file
            capture trailing newlines).

    Returns:
        ``". "`` when group 1 is exactly one stop character, otherwise
        ``".\\n"``.
    """
    # Accept the ideographic stop, the ASCII period and the full-width
    # period (U+FF0E); previously a lone full-width period fell through to
    # the newline branch.
    bare_stops = ("。", ".", "\uff0e")
    if matchobj.group(1) in bare_stops:
        return ". "
    return ".\n"
|
||
def refine_brackets(matchobj):
    """Regex callback: return group 1 without its first and last character."""
    wrapped = matchobj.group(1)
    inner = wrapped[1:-1]
    return inner
|
||
def insert_a_blank(matchobj):
    """Regex callback: put a single space before the last character of group 1."""
    token = matchobj.group(1)
    head, last_char = token[:-1], token[-1]
    return " ".join((head, last_char))
|
||
def multiple_choice(matchobj):
    """Regex callback: wrap groups 1-4 as the arguments of ``\\fourch``.

    Returns:
        ``\\fourch{g1}{g2}{g3}{g4}`` followed by a newline.
    """
    options = matchobj.group(1, 2, 3, 4)
    braced = ["{" + option + "}" for option in options]
    return "\\fourch" + "".join(braced) + "\n"
|
||
def boldsymbols(matchobj):
    """Regex callback: prefix group 1 with ``\\i`` and wrap its last character in ``\\mathbf{}``."""
    captured = matchobj.group(1)
    leading, symbol = captured[:-1], captured[-1]
    return "".join(("\\i", leading, "\\mathbf{", symbol, "}"))
|
||
def boldsymbols_star(matchobj):
    """Regex callback: emit ``\\in \\mathbf{<group 1>}^*``."""
    symbol = matchobj.group(1)
    return "".join(("\\in \\mathbf{", symbol, "}^*"))
|
||
def singleboldsymbols(matchobj):
    """Regex callback: emit group 1 as an inline formula ``$\\mathbf{...}$``."""
    symbol = matchobj.group(1)
    return "".join(("$\\mathbf{", symbol, "}$"))
|
||
def blackboardbold(matchobj):
    """Regex callback: wrap group 1 in ``\\mathbf{...}``."""
    return "".join(("\\mathbf", "{", matchobj.group(1), "}"))
|
||
def limit(matchobj):
    """Regex callback: emit ``\\displaystyle\\lim_{<group 1>}``."""
    subscript = matchobj.group(1)
    return "".join(("\\displaystyle\\lim_{", subscript, "}"))
|
||
def replace_i(matchobj):
    """Regex callback: replace letters ``i`` in group 1 with ``\\mathrm{i}``.

    The text is scanned from the end towards the start.  An ``i`` is left
    untouched when the text from that ``i`` to the end contains ``item`` or
    ``overline``.

    NOTE(review): because the test looks at the text *from* the ``i``
    onwards, the ``i`` inside the word ``overline`` itself is not protected
    by the ``overline`` check - confirm whether that is intended.

    Returns:
        Group 1 with the selected ``i`` characters replaced.
    """
    text = matchobj.group(1)
    for pos in reversed(range(len(text))):
        if text[pos] != "i":
            continue
        tail = text[pos:]
        if "item" in tail or "overline" in tail:
            continue
        text = text[:pos] + "\\mathrm{i}" + text[pos + 1:]
    return text
|
||
def refine_log(matchobj):
    """Regex callback: emit ``\\log_`` followed by group 1."""
    base = matchobj.group(1)
    return "".join((r"\log_", base))
|
||
def refine_powers(matchobj):
    """Regex callback: join group 1 and group 2 with a caret (``g1^g2``)."""
    return "^".join(matchobj.group(1, 2))
|
||
def refine_sequences(matchobj):
    """Regex callback: wrap group 1 in escaped braces, ``\\{ ... \\}``.

    Raw string literals are used for the brace delimiters: ``"\\{"`` written
    as a normal literal is an invalid escape sequence, which newer Python
    versions warn about.  The produced text is unchanged.
    """
    return r"\{" + matchobj.group(1) + r"\}"
|
||
def refine_starting_brackets(matchobj):
    """Regex callback: put a ``$`` in front of group 1."""
    captured = matchobj.group(1)
    return "".join(("$", captured))
|
||
def refine_left_operating_brackets(matchobj):
    """Regex callback: return group 1 immediately followed by group 2.

    Whatever the pattern matched outside those two groups is dropped.
    """
    return "".join(matchobj.group(1, 2))
|
||
def refine_right_operating_brackets(matchobj):
    """Regex callback: return group 1 immediately followed by group 2.

    Whatever the pattern matched outside those two groups is dropped.
    """
    first, second = matchobj.group(1, 2)
    return first + second
|
||
def refine_brackets_in_brackets(matchobj):
    """Regex callback: concatenate groups 1, 2 and 3, dropping uncaptured text."""
    return "".join(matchobj.group(1, 2, 3))
|
||
def mathbf(matchobj):
    """Regex callback: emit ``\\mathbf{<group 1>}^<group 2>``."""
    symbol, exponent = matchobj.group(1, 2)
    return "".join(("\\mathbf{", symbol, "}^", exponent))
|
||
#以上是202207之前的文本处理机制
|
||
# Level number stamped into the placeholder names produced by
# rename_bracket.  A ``global`` statement at module level does not create
# the name, so a default is bound here; code that runs a multi-level pass
# can rebind ``layer`` before substituting.
layer = 0


def rename_bracket(matchobj):
    """Regex callback: wrap group 1 in numbered bracket placeholders.

    Returns:
        ``leftbracket<layer>`` + group 1 + ``rightbracket<layer>``, where
        ``<layer>`` is the current value of the module-level ``layer``.
    """
    return "leftbracket" + str(layer) + matchobj.group(1) + "rightbracket" + str(layer)
|
||
def frac_brackets(matchobj):
    """Regex callback: emit ``frac{<group 1>}{<group 2>}``."""
    numerator, denominator = matchobj.group(1, 2)
    return "".join(("frac{", numerator, "}{", denominator, "}"))
|
||
def frac_single_second_bracket(matchobj):
    """Regex callback: emit ``frac <group 1>{<group 2>}``."""
    first, second = matchobj.group(1, 2)
    return "".join(("frac ", first, "{", second, "}"))
|
||
def recall_vital_bracket(matchobj):
    """Regex callback: emit group 1 followed by group 2 wrapped in ``{}``."""
    keyword, argument = matchobj.group(1, 2)
    return "".join((keyword, "{", argument, "}"))
|
||
def sqrt_brackets(matchobj):
    """Regex callback: emit ``sqrt <group 1>{<group 2>}``.

    Group 1 is optional in the calling pattern; when it did not take part
    in the match it contributes an empty string.
    """
    optional_part = matchobj.group(1) or ""
    return "".join(("sqrt ", optional_part, "{", matchobj.group(2), "}"))
|
||
#def refine_frac(string):
|
||
# for s in range(7):
|
||
# for t in range(7):
|
||
# string = re.sub(r"frac[\s]*leftbracket"+str(s)+"(.*?)"+r"rightbracket"+str(s)+"[\s]*"+r"leftbracket"+str(t)+"(.*?)"+r"rightbracket"+str(t),frac_brackets,string)
|
||
# return string
|
||
def refine_single_second_frac(string):
    """Rewrite ``frac`` + one word character + a numbered placeholder group.

    For each level 0-6, text of the form ``frac X leftbracketN ... rightbracketN``
    (whitespace optional, ``X`` a single ``\\w`` character) is passed to
    ``frac_single_second_bracket``.

    Returns:
        The string after all seven substitution passes.
    """
    template = r"frac[\s]*(\w)[\s]*leftbracket%d(.*?)rightbracket%d"
    for level in range(7):
        string = re.sub(template % (level, level), frac_single_second_bracket, string)
    return string
|
||
def refine_vital_bracket(string):
    """Restore real braces after selected keywords.

    For each level 0-6, and for each keyword in a fixed order, a numbered
    placeholder group that follows the keyword (optionally after
    whitespace) is handed to ``recall_vital_bracket``.

    Returns:
        The string after all substitution passes.
    """
    # Regex fragments; order matches the original statement order.
    keywords = ("frac", "line", "arrow", "_", r"\^", "mathrm", "mathbf", "begin", "end")
    for level in range(7):
        digit = str(level)
        for keyword in keywords:
            pattern = "(" + keyword + r")[\s]*leftbracket" + digit + "(.*?)rightbracket" + digit
            string = re.sub(pattern, recall_vital_bracket, string)
    return string
|
||
def refine_sqrt(string):
    """Rewrite ``sqrt`` followed by a numbered placeholder group.

    For each level 0-6, ``sqrt``, an optional ``[...]`` part and a
    ``leftbracketN ... rightbracketN`` group are passed to ``sqrt_brackets``.

    Returns:
        The string after all seven substitution passes.
    """
    template = r"sqrt[\s]*(\[\w*\])*[\s]*leftbracket%d(.*?)rightbracket%d"
    for level in range(7):
        string = re.sub(template % (level, level), sqrt_brackets, string)
    return string
|
||
def give_blanks(string):
    """Separate ``sqrt`` / ``frac`` from a directly following word character.

    Each occurrence of ``sqrt`` or ``frac`` followed by one ``\\w`` character
    is passed to ``insert_a_blank``; ``sqrt`` is processed first.
    """
    for pattern in (r"(sqrt[\w])", r"(frac[\w])"):
        string = re.sub(pattern, insert_a_blank, string)
    return string
|
||
def give_brackets(string):
    """Remove numbered bracket placeholders and restore set braces.

    ``leftbracketN`` / ``rightbracketN`` (``N`` one digit) are deleted, and
    ``leftset`` / ``rightset`` become ``\\{`` / ``\\}``.
    """
    substitutions = (
        (r"leftbracket\d", ""),
        (r"rightbracket\d", ""),
        (r"leftset", r"\{"),
        (r"rightset", r"\}"),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string
|
||
#以上是20220715新加的文本处理机制
|
||
def initial_bracket_search(string):
    """Locate a ``leftbracketN`` placeholder at the start of ``string``.

    Leading whitespace is allowed before the placeholder.

    Returns:
        The index just past the placeholder's digit, or ``-1`` when the
        string does not start with such a placeholder.
    """
    found = re.search(r"^[\s]*?leftbracket(\d)", string)
    return found.end() if found is not None else -1
|
||
def initial_brackets_pair_search(string,d):
    """Find the first ``rightbracket`` + ``d`` in ``string``.

    Args:
        string: Text to search.
        d: Text appended to ``rightbracket`` to form the search pattern
            (callers in this file pass a single digit character).

    Returns:
        The index just past the first occurrence, or ``-1`` if there is none.
    """
    closing = re.search("rightbracket" + d, string)
    if closing is not None:
        return closing.end()
    return -1
|
||
def refine_frac(string):
    """Turn the two placeholder groups after each ``frac`` into ``{...}{...}``.

    The string is consumed left to right.  After every ``frac``, if the
    remaining text starts with a ``leftbracketN`` placeholder and the text
    after its matching ``rightbracketN`` starts with another placeholder
    group, the delimiters of both groups are replaced with ``{`` and ``}``.
    Otherwise the text after that ``frac`` is left as it is.

    NOTE(review): the nesting below was reconstructed from the data
    dependencies between the statements (the source had lost its
    indentation) - confirm against the original file.

    Args:
        string: Text containing ``leftbracketN`` / ``rightbracketN`` placeholders.

    Returns:
        The rewritten text.
    """
    eq_left = ""  # finished part, ends just after the last "frac" handled
    eq_right = string  # part still to be scanned
    while re.search("frac",eq_right) != None:
        pos = re.search("frac",eq_right)
        # Move everything up to and including this "frac" to the finished part.
        eq_left += eq_right[:pos.span()[1]]
        eq_right = eq_right[pos.span()[1]:]
        if initial_bracket_search(eq_right)>0:
            # The character just before the returned index is the placeholder's digit.
            # NOTE(review): initial_brackets_pair_search returns -1 when no
            # closing placeholder exists; that value is used as a slice index
            # here without a check.
            pos = initial_brackets_pair_search(eq_right,eq_right[initial_bracket_search(eq_right)-1])
            first_bracket = eq_right[:pos]
            first_layer = first_bracket[-1]  # digit that ends "rightbracketN"
            eq_remain = eq_right[pos:]
            if initial_bracket_search(eq_remain)>0:
                pos = initial_brackets_pair_search(eq_remain,eq_remain[initial_bracket_search(eq_remain)-1])
                second_bracket = eq_remain[:pos]
                second_layer = second_bracket[-1]
                # Swap the placeholders of both groups for real braces.
                first_bracket = re.sub(r"leftbracket"+first_layer,"{",first_bracket)
                second_bracket = re.sub(r"leftbracket"+second_layer,"{",second_bracket)
                first_bracket = re.sub(r"rightbracket"+first_layer,"}",first_bracket)
                second_bracket = re.sub(r"rightbracket"+second_layer,"}",second_bracket)
                # The rebuilt text is scanned again, so a "frac" inside either
                # group is handled by a later iteration.
                eq_right = first_bracket+second_bracket+eq_remain[pos:]
    return eq_left+eq_right
|
||
#以上是20220718修改的大括号处理机制, 修复了一个bug
|
||
def reduce_blank(matchobj):
    """Regex callback: return group 1 with every space character removed."""
    pieces = matchobj.group(1).split(" ")
    return "".join(pieces)
|
||
def add_dollars(matchobj):
    """Regex callback: wrap the interior of group 1 in ``$...$``.

    The first and last characters of group 1 stay outside the dollars.
    """
    captured = matchobj.group(1)
    first, interior, last = captured[0], captured[1:-1], captured[-1]
    return "".join((first, r"$", interior, r"$", last))
|
||
def del_first_char(matchobj):
    """Regex callback: return group 1 without its first character."""
    captured = matchobj.group(1)
    return captured[1:]
|
||
def add_underline(matchobj):
    """Regex callback: join the first and last characters of group 1 with ``_``.

    Any characters between them are dropped.
    """
    captured = matchobj.group(1)
    return "_".join((captured[0], captured[-1]))
|
||
def brackets_to_cwords(matchobj):
    """Regex callback: surround group 1 with the words ``左括号`` and ``右括号``."""
    inner = matchobj.group(1)
    return "".join(("左括号", inner, "右括号"))
|
||
def cwords_to_brackets(matchobj):
    """Regex callback: surround group 1 with round brackets."""
    inner = matchobj.group(1)
    return "".join(("(", inner, ")"))
|
||
def circled_brackets(matchobj):
    """Regex callback: wrap the last character of group 1 in ``{}``."""
    captured = matchobj.group(1)
    prefix, last_char = captured[:-1], captured[-1]
    return "".join((prefix, "{", last_char, "}"))
|
||
|
||
# try:
|
||
# os.chdir(r"D:\mathdept\mathdept\文本处理程序等")
|
||
# except:
|
||
# os.chdir(r"D:\mathdept\文本处理程序等")
|
||
# with open("textfile.txt", "r", encoding = "utf8") as textfile:
|
||
# data = textfile.read()
|
||
|
||
# Read the text to process from the clipboard.
data = getCopy()

# Strip the \left / \right sizing prefixes from delimiters.  \rightarrow is
# shortened to \to first.  The look-ahead keeps commands that merely start
# with these words (\leftarrow, \leftrightarrow, \rightleftharpoons, ...)
# intact; plain str.replace used to cut them apart.
data = data.replace(r"\rightarrow", r"\to")
data = re.sub(r"\\(?:left|right)(?![A-Za-z])\.?", "", data)

# Convert full-width punctuation to its ASCII form.
# NOTE(review): the patterns in this section had been reduced to plain ASCII
# ("(", ")", "?", ".", " "), which either fail to compile as regular
# expressions, match every character, or do nothing.  They are assumed to
# have been the full-width characters written below as escapes - confirm
# against the original file.
data = data.replace("\u3000", " ")  # ideographic space
# Fold the full-width period into the ideographic stop so that one pass of
# full_stop handles both.
data = data.replace("\uff0e", "。")
data = re.sub("(。[\n]*)", full_stop, data)
_fullwidth_to_ascii = str.maketrans({
    "\uff0c": ", ",  # full-width comma
    "\uff1a": ": ",  # full-width colon
    "\uff1b": "; ",  # full-width semicolon
    "\uff08": "(",   # full-width left parenthesis
    "\uff09": ")",   # full-width right parenthesis
    "\uff1f": "? ",  # full-width question mark
    "\u201c": "``",  # left double quotation mark
    "\u201d": "''",  # right double quotation mark
})
data = data.translate(_fullwidth_to_ascii)
data = data.replace(" ``", "``")
data = data.replace("'' ", "''")

# Turn question numbers ("12. ", "12、", optionally prefixed with 例) at the
# start of the text or of a line into \item.
data = re.sub(r"例[\s]*", "例", data)
data = re.sub(r"(^[例]*[0-9]+[\s]*\.[\s]+)", r"\n\\item ", data)
data = re.sub(r"(^[例]*[0-9]+[\s]*、[\s]*)", r"\n\\item ", data)
data = re.sub(r"(\n[例]*[0-9]+[\s]*\.[\s]+)", r"\n\\item ", data)
data = re.sub(r"(\n[例]*[0-9]+[\s]*、[\s]*)", r"\n\\item ", data)

# Replace the \[ ... \] formula markers with $, then drop any "$$" pair.
data = data.replace(r"\[", "$").replace(r"\]", "$")
data = data.replace("$$", "")

# Separate punctuation from a directly following $.
data = re.sub(r"([,.:;])\$", r"\1 $", data)

# Rewrite multiple-choice options ("A. .. B. .. C. .. D. .." or the
# "(A) .. (D)" form, terminated by a newline) as \fourch{..}{..}{..}{..}.
# Both forms are applied twice, in the same order as before.
_choice_patterns = (
    r"A\.([\s\S]*?)B\.([\s\S]*?)C\.([\s\S]*?)D\.([\s\S]*?)\n",
    r"\(A\)([\s\S]*?)\(B\)([\s\S]*?)\(C\)([\s\S]*?)\(D\)([\s\S]*?)\n",
)
for _ in range(2):
    for _choice_pattern in _choice_patterns:
        data = re.sub(_choice_pattern, multiple_choice, data)
data = re.sub(r"\$[ ]+\}", "$}", data)
data = re.sub(r"\{[ ]+\$", "{$", data)

# Use \dfrac instead of \frac.
data = data.replace("\\frac", "\\dfrac")

# Collapse blank lines (lines holding only tabs or spaces); repeated because
# one pass cannot merge overlapping runs.
for _ in range(20):
    data = re.sub("\n[\t ]*\n", "\n", data)

# Delete \quad, \qquad, ...
data = re.sub(r"\\q+uad", "", data)

# Delete ~.
data = data.replace("~", "")

data1 = data  # snapshot of the text after the replacements above
|
||
|
||
# Split the text into prose segments and formulas.  A formula is the
# shortest span from one "$" to the next and keeps both "$" delimiters.
# One finditer pass replaces the earlier search-and-reslice loop, which
# copied the remaining text after every formula.
raw_texts = []  # prose segments, in document order
raw_equations = []  # formulas, in document order
_cursor = 0  # index in ``data`` just past the last formula consumed
for _formula in re.finditer(r"\$[\s\S]*?\$", data):
    raw_texts.append(data[_cursor:_formula.start()])
    raw_equations.append(_formula.group())
    _cursor = _formula.end()
if _cursor < len(data):
    # Trailing prose after the last formula (or the whole text when there
    # is no formula at all).
    raw_texts.append(data[_cursor:])
# raw_texts now has either as many entries as raw_equations or one more.
|
||
|
||
modified_texts = []
modified_equations = []

# Substitutions applied, in this order, to every prose segment.
# (Blank-detection rules that were commented out in the original stay disabled.)
_text_rules = (
    # Drop whitespace just inside option braces.
    # NOTE(review): the lazy quantifier makes the first rule remove only one
    # whitespace character after "{" - confirm whether that is intended.
    (r"\{[\s]+?", "{"),
    (r"[\s]+?\}", "}"),
    # Multiple-choice answer slot: "(\blank{50})" or empty parentheses.
    (r"\(\\blank\{50\}\)", r"\\bracket{20}"),
    (r"\([\s]{1,10}\)", r"\\bracket{20}"),
    # Exactly one space after a comma.
    (",[ ]*", ", "),
    (r"\.\}", "}"),
    (r"\n\d{1,3}\.", r"\n\\item "),
    (r"\s*\\bracket\{20\}\s*\n", r"\\bracket{20}.\n"),
    # Non-standard multiple-choice options: drop "." or ";" before "}".
    (r"[\.;]\}", "}"),
    # A short number between two CJK characters becomes an inline formula.
    (r"([\u4e00-\u9fa5])[\s]+([\d]{1,6})[\s]+([\u4e00-\u9fa5])", r"\1$\2$\3"),
)

for text in raw_texts:
    text1 = text
    for _pattern, _replacement in _text_rules:
        text1 = re.sub(_pattern, _replacement, text1)
    modified_texts.append(text1)
|
||
|
||
|
||
for equation in raw_equations:
    equation1 = equation
    # Remove braces around a single-character subscript or superscript.
    for _ in range(3):
        equation1 = re.sub(r"_\{([0-9a-zA-Z])\}", r"_\1", equation1)
        equation1 = re.sub(r"\^\{([0-9a-zA-Z])\}", r"^\1", equation1)
    # Join digits / capital letters separated only by whitespace; done twice
    # because each match consumes its right-hand character.
    for _ in range(2):
        equation1 = re.sub(r"([0-9A-Z])\s+([0-9A-Z])", r"\1\2", equation1)
    # Combination / permutation symbols: upright C or P before "_...^".
    equation1 = re.sub(
        r"([CP])(_[^_\^]{,5}\^)",
        lambda found: r"\mathrm{" + found.group(1) + "}" + found.group(2),
        equation1,
    )
    # Units are typeset with text{} instead of mathrm{}.
    equation1 = re.sub(r"mathrm\{cm\}", "text{cm}", equation1)
    equation1 = re.sub(r"mathrm\{km\}", "text{km}", equation1)
    # Use cdots instead of ldots.
    equation1 = re.sub(r"ldots", "cdots", equation1)
    modified_equations.append(equation1)
|
||
|
||
|
||
# Interleave the modified prose segments and formulas back into one string.
# There may be one more prose segment than formulas (trailing prose), so the
# formula index is checked explicitly instead of relying on a bare ``except``.
_merged_parts = []
for _index, _text_piece in enumerate(modified_texts):
    _merged_parts.append(_text_piece)
    if _index < len(modified_equations):
        _merged_parts.append(modified_equations[_index])
modified_data = "".join(_merged_parts)

# Drop spaces at the end of a line.
modified_data = re.sub(r"[ ]+\n","\n",modified_data)
# A formula holding only \parallel loses its dollar signs.
modified_data = re.sub(r"\$[\s]*?\\parallel[\s]*?\$",r"\\parallel",modified_data)
# "例 N" at the start of a line becomes \item.
modified_data = re.sub(r"\n例\s*?\d{1,3}\s*",r"\n\\item ",modified_data)
# A formula holding a single punctuation mark loses its dollar signs.
modified_data = re.sub(r"(\$[\,\.:;]\$)",refine_brackets,modified_data)
|
||
|
||
|
||
# Remove spaces left over from Mathpix output: a single space between two
# CJK characters, and a space directly after or before "$".
# NOTE(review): the source had lost its indentation; all three substitutions
# are placed inside the loop here - confirm the intended loop extent.
for i in range(3):
    modified_data = re.sub(r"([\u4e00-\u9fa5])( )([\u4e00-\u9fa5])",lambda x:x.group(1)+x.group(3),modified_data)
    modified_data = re.sub(r"\$ ","$",modified_data)
    modified_data = re.sub(r" \$","$",modified_data)
# Fix characters that Mathpix commonly misrecognises.
modified_data = modified_data.replace("雉","锥")
modified_data = re.sub("[粗秿]圆","椭圆",modified_data)
modified_data = modified_data.replace("针角","钝角")
modified_data = re.sub("投郑","投掷",modified_data)
modified_data = re.sub("抛郑","抛掷",modified_data)
modified_data = re.sub("范目","范围",modified_data)
modified_data = re.sub("揷","插",modified_data)
# Vectors and bars: use \overrightarrow / \overline.
modified_data = modified_data.replace(r"\vec",r"\overrightarrow ")
modified_data = modified_data.replace(r"\bar",r"\overline ")
# Limits as n goes to infinity are set in display style.
modified_data = re.sub(r"\\lim[\s]*_\{n \\to \\infty\}",r"\\displaystyle\\lim_{n\\to\\infty}",modified_data)
# An enumeration comma surrounded by spaces is placed between two "$".
modified_data = modified_data.replace(r" 、 ",r"$、$")
# Remove "slant" (e.g. \leqslant becomes \leq) and other small rewrites.
modified_data = modified_data.replace(r"slant","")
modified_data = modified_data.replace(r"\mid","|")
modified_data = re.sub(r"\\mathrm\{\\mathrm\{i\}\}",r"\\mathrm{i}",modified_data)
modified_data = modified_data.replace(",$",", $")
modified_data = modified_data.replace(" / /",r"\parallel")
modified_data = modified_data.replace("mathrmR",r"mathbf{R}")
modified_data = modified_data.replace(r"^{\prime}","'")
# Inside a subscript or superscript \dfrac goes back to \frac.
modified_data = re.sub(r"\^\{\\dfrac",r"^{\\frac",modified_data)
modified_data = re.sub(r"\^\{-\\dfrac",r"^{-\\frac",modified_data)
modified_data = re.sub(r"_\{\\dfrac",r"_{\\frac",modified_data)
modified_data = re.sub(r"_\{-\\dfrac",r"_{-\\frac",modified_data)

# Piecewise functions: "\{\begin{array}{...}" becomes \begin{cases}.
# Every \end{array} is rewritten, whether or not its \begin was.
modified_data = re.sub(r"\\{\\begin\{array\}\{[rcl]*\}",r"\\begin{cases}",modified_data)
modified_data = re.sub(r"\\end{array}",r"\\end{cases}",modified_data)

# Add a space after a colon that is directly followed by a non-space character.
modified_data = re.sub(r":([\S])", lambda x:": "+x.group(1),modified_data)

# Fill-in-the-blank items: when the line before an \item ends with a CJK
# character or "$", append \blank{50} and a period.
modified_data = re.sub(r"([\u4e00-\u9fa5\$])[\s]*\n\\item",lambda x: x.group(1)+"\\blank{50}.\n\\item",modified_data)

# Multiple-choice items: "$( )$" becomes \bracket{20}, and a line before
# \fourch that ends with a CJK character or "$" gets \bracket{20} and a period.
modified_data = re.sub(r"\$\(\s*\)\$",r"\\bracket{20}",modified_data)
modified_data = re.sub(r"([\u4e00-\u9fa5\$])[\s]*\n\\fourch",lambda x: x.group(1)+"\\bracket{20}.\n\\fourch",modified_data)

# Arcs: overparen becomes overset\frown.
modified_data = re.sub(r"overparen",r"overset\\frown",modified_data)

# Remove "$$" standing between two non-space characters.
modified_data = re.sub(r"([\S])(\$\$)([\S])",lambda x: x.group(1)+x.group(3),modified_data)

# Put the result on the clipboard.
setCopy(modified_data)

# Also write the result to a file; the path is relative to the current
# working directory.
with open("临时文件/outputfile.txt","w",encoding = "utf8") as f:
    f.write(modified_data)