"""Clean up Mathpix OCR output held on the Windows clipboard.

The script reads text from the clipboard, normalises punctuation, converts
question numbers / multiple-choice options / formula delimiters to the
LaTeX macros used by the surrounding project (``\\item``, ``\\fourch``,
``\\blank``, ``\\bracket``) and leaves the result in ``modified_data``.
"""
import os
import re

import win32clipboard as wc
import win32con


# ---------------------------------------------------------------------------
# Clipboard access
# ---------------------------------------------------------------------------
def getCopy():
    """Return the Unicode text currently on the clipboard."""
    wc.OpenClipboard()
    try:
        # Raises if the clipboard holds no CF_UNICODETEXT data; the
        # ``finally`` keeps the clipboard from staying locked in that case.
        return wc.GetClipboardData(win32con.CF_UNICODETEXT)
    finally:
        wc.CloseClipboard()


def setCopy(str):
    """Replace the clipboard content with the text ``str``."""
    wc.OpenClipboard()
    try:
        wc.EmptyClipboard()
        wc.SetClipboardData(win32con.CF_UNICODETEXT, str)
    finally:
        wc.CloseClipboard()


# ---------------------------------------------------------------------------
# re.sub callbacks (each receives a match object and returns the replacement)
# ---------------------------------------------------------------------------
def full_stop(matchobj):
    """Turn a CJK / full-width full stop (plus trailing newlines) into ASCII.

    A full stop with nothing after it becomes ``". "``; one followed by
    newlines becomes ``".\\n"``.
    """
    # "\uff0e" is FULLWIDTH FULL STOP, written as an escape so that Unicode
    # normalisation of this file cannot silently turn it into ".".
    if matchobj.group(1) in ("。", "\uff0e"):
        return ". "
    return ".\n"


def refine_brackets(matchobj):
    """Drop the first and last character of group 1."""
    return matchobj.group(1)[1:-1]


def insert_a_blank(matchobj):
    """Insert a space before the last character of group 1."""
    return matchobj.group(1)[:-1] + " " + matchobj.group(1)[-1]


def multiple_choice(matchobj):
    """Wrap the four captured options in a ``\\fourch{..}{..}{..}{..}`` line."""
    string = (
        "\\fourch"
        + "{" + matchobj.group(1)
        + "}{" + matchobj.group(2)
        + "}{" + matchobj.group(3)
        + "}{" + matchobj.group(4)
        + "}\n"
    )
    return string


def boldsymbols(matchobj):
    """Prefix group 1 with ``\\i`` and wrap its last character in ``\\mathbf``."""
    return (
        "\\i" + matchobj.group(1)[:-1]
        + "\\mathbf{" + matchobj.group(1)[-1] + "}"
    )


def boldsymbols_star(matchobj):
    """Return ``\\in \\mathbf{<group 1>}^*``."""
    return "\\in \\mathbf{" + matchobj.group(1) + "}^*"


def singleboldsymbols(matchobj):
    """Return group 1 as an inline ``$\\mathbf{..}$`` formula."""
    return "$\\mathbf{" + matchobj.group(1) + "}$"


def blackboardbold(matchobj):
    """Wrap group 1 in ``\\mathbf{..}``."""
    string = "\\mathbf" + "{" + matchobj.group(1) + "}"
    return string


def limit(matchobj):
    """Return ``\\displaystyle\\lim_{<group 1>}``."""
    return "\\displaystyle\\lim_{" + matchobj.group(1) + "}"


def replace_i(matchobj):
    """Replace ``i`` characters in group 1 by ``\\mathrm{i}``.

    An ``i`` is left alone when the substring starting at it still contains
    ``item`` or ``overline``.  The scan runs right-to-left so that inserted
    text does not shift indices that are still to be visited.
    """
    string = matchobj.group(1)
    length = len(string)
    for i in range(length - 1, -1, -1):
        tail = string[i:]
        if string[i] == "i" and "item" not in tail and "overline" not in tail:
            string = string[:i] + "\\mathrm{i}" + string[i + 1:]
    return string


def refine_log(matchobj):
    """Return ``\\log_`` followed by group 1."""
    return r"\log_" + matchobj.group(1)


def refine_powers(matchobj):
    """Join group 1 (base) and group 2 (power) with ``^``."""
    base = matchobj.group(1)
    power = matchobj.group(2)
    return base + "^" + power


def refine_sequences(matchobj):
    """Wrap group 1 in escaped braces ``\\{ .. \\}``."""
    return "\\{" + matchobj.group(1) + "\\}"


def refine_starting_brackets(matchobj):
    """Prefix group 1 with ``$``."""
    return "$" + matchobj.group(1)


def refine_left_operating_brackets(matchobj):
    """Concatenate group 1 and group 2."""
    obj = matchobj.group(2)
    return matchobj.group(1) + obj


def refine_right_operating_brackets(matchobj):
    """Concatenate group 1 and group 2."""
    obj = matchobj.group(1)
    return obj + matchobj.group(2)


def refine_brackets_in_brackets(matchobj):
    """Concatenate groups 1, 2 and 3."""
    return matchobj.group(1) + matchobj.group(2) + matchobj.group(3)


def mathbf(matchobj):
    """Return ``\\mathbf{<group 1>}^<group 2>``."""
    return "\\mathbf{" + matchobj.group(1) + "}^" + matchobj.group(2)


# Everything above is the text-processing machinery from before 2022-07.

# NOTE(review): ``global`` at module level has no effect, and ``layer`` is
# never assigned anywhere in this file, so rename_bracket() raises NameError
# unless a caller defines ``layer`` first.
global layer


def rename_bracket(matchobj):
    """Wrap group 1 in ``leftbracket<layer>`` / ``rightbracket<layer>`` markers."""
    return (
        "leftbracket" + str(layer)
        + matchobj.group(1)
        + "rightbracket" + str(layer)
    )


def frac_brackets(matchobj):
    """Return ``frac{<group 1>}{<group 2>}``."""
    return "frac{" + matchobj.group(1) + "}{" + matchobj.group(2) + "}"


def frac_single_second_bracket(matchobj):
    """Return ``frac <group 1>{<group 2>}``."""
    return "frac " + matchobj.group(1) + "{" + matchobj.group(2) + "}"


def recall_vital_bracket(matchobj):
    """Return group 1 followed by group 2 in real braces."""
    return matchobj.group(1) + "{" + matchobj.group(2) + "}"


def sqrt_brackets(matchobj):
    """Return ``sqrt <optional index>{<group 2>}``; a missing index is empty."""
    if matchobj.group(1) is None:
        first_group = ""
    else:
        first_group = matchobj.group(1)
    return "sqrt " + first_group + "{" + matchobj.group(2) + "}"


# Earlier, disabled implementation of refine_frac:
# def refine_frac(string):
#     for s in range(7):
#         for t in range(7):
#             string = re.sub(r"frac[\s]*leftbracket"+str(s)+"(.*?)"+r"rightbracket"+str(s)+"[\s]*"+r"leftbracket"+str(t)+"(.*?)"+r"rightbracket"+str(t),frac_brackets,string)
#     return string


def refine_single_second_frac(string):
    """Rewrite ``frac X leftbracketN .. rightbracketN`` as ``frac X{..}``.

    Applied for every bracket layer N in 0..6.
    """
    for s in range(7):
        string = re.sub(
            r"frac[\s]*(\w)[\s]*leftbracket" + str(s) + "(.*?)"
            + r"rightbracket" + str(s),
            frac_single_second_bracket,
            string,
        )
    return string


def refine_vital_bracket(string):
    """Restore real braces after commands that need a braced argument.

    For every bracket layer 0..6 and, in this order, each of the prefixes
    below, ``<prefix> leftbracketN .. rightbracketN`` becomes
    ``<prefix>{..}``.
    """
    # Regex fragments, in the original application order.
    prefixes = (
        "frac", "line", "arrow", "_", r"\^",
        "mathrm", "mathbf", "begin", "end",
    )
    for s in range(7):
        for prefix in prefixes:
            string = re.sub(
                "(" + prefix + r")[\s]*leftbracket" + str(s)
                + "(.*?)rightbracket" + str(s),
                recall_vital_bracket,
                string,
            )
    return string


def refine_sqrt(string):
    """Rewrite ``sqrt [idx] leftbracketN .. rightbracketN`` as ``sqrt [idx]{..}``."""
    for s in range(7):
        string = re.sub(
            r"sqrt[\s]*(\[\w*\])*[\s]*leftbracket" + str(s)
            + "(.*?)rightbracket" + str(s),
            sqrt_brackets,
            string,
        )
    return string


def give_blanks(string):
    """Insert a space between ``sqrt`` / ``frac`` and a directly following word character."""
    string = re.sub(r"(sqrt[\w])", insert_a_blank, string)
    string = re.sub(r"(frac[\w])", insert_a_blank, string)
    return string


def give_brackets(string):
    """Drop remaining bracket markers and turn set markers into ``\\{`` / ``\\}``."""
    string = re.sub(r"leftbracket\d", "", string)
    string = re.sub(r"rightbracket\d", "", string)
    string = re.sub(r"leftset", r"\{", string)
    string = re.sub(r"rightset", r"\}", string)
    return string


# Everything above was added on 2022-07-15.

def initial_bracket_search(string):
    """Return the end index of a leading ``leftbracket<digit>`` marker, or -1.

    Only whitespace may precede the marker.
    """
    t = re.search(r"^[\s]*?leftbracket(\d)", string)
    if t is None:
        return -1
    return t.span()[1]


def initial_brackets_pair_search(string, d):
    """Return the end index of the first ``rightbracket<d>`` in ``string``, or -1."""
    t = re.search("rightbracket" + d, string)
    if t is None:
        return -1
    return t.span()[1]


def refine_frac(string):
    """Give every ``frac`` that is followed by two bracket groups real braces.

    The text is consumed left to right: everything up to and including each
    ``frac`` moves to ``eq_left``; if the remainder starts with two
    consecutive ``leftbracketN .. rightbracketN`` groups, their markers are
    replaced by ``{`` and ``}``.

    NOTE(review): when a closing marker is missing,
    initial_brackets_pair_search() returns -1 and that value is used as a
    slice bound unchanged; behaviour on such malformed input is kept as in
    the original.
    """
    eq_left = ""
    eq_right = string
    while re.search("frac", eq_right) is not None:
        pos = re.search("frac", eq_right)
        eq_left += eq_right[:pos.span()[1]]
        eq_right = eq_right[pos.span()[1]:]
        if initial_bracket_search(eq_right) > 0:
            # The character just before the returned index is the layer digit.
            pos = initial_brackets_pair_search(
                eq_right, eq_right[initial_bracket_search(eq_right) - 1]
            )
            first_bracket = eq_right[:pos]
            first_layer = first_bracket[-1]
            eq_remain = eq_right[pos:]
            if initial_bracket_search(eq_remain) > 0:
                pos = initial_brackets_pair_search(
                    eq_remain, eq_remain[initial_bracket_search(eq_remain) - 1]
                )
                second_bracket = eq_remain[:pos]
                second_layer = second_bracket[-1]
                first_bracket = re.sub(r"leftbracket" + first_layer, "{", first_bracket)
                second_bracket = re.sub(r"leftbracket" + second_layer, "{", second_bracket)
                first_bracket = re.sub(r"rightbracket" + first_layer, "}", first_bracket)
                second_bracket = re.sub(r"rightbracket" + second_layer, "}", second_bracket)
                eq_right = first_bracket + second_bracket + eq_remain[pos:]
    return eq_left + eq_right


# The brace handling above was revised on 2022-07-18 (one bug fixed).

def reduce_blank(matchobj):
    """Remove every space from group 1."""
    return matchobj.group(1).replace(" ", "")


def add_dollars(matchobj):
    """Keep the first and last character of group 1 and wrap the middle in ``$``."""
    return (
        matchobj.group(1)[0] + r"$"
        + matchobj.group(1)[1:-1]
        + r"$" + matchobj.group(1)[-1]
    )


def del_first_char(matchobj):
    """Drop the first character of group 1."""
    return matchobj.group(1)[1:]


def add_underline(matchobj):
    """Return the first and last character of group 1 joined by ``_``."""
    return matchobj.group(1)[0] + "_" + matchobj.group(1)[-1]


def brackets_to_cwords(matchobj):
    """Wrap group 1 in the placeholder words for "left/right parenthesis"."""
    return "左括号" + matchobj.group(1) + "右括号"


def cwords_to_brackets(matchobj):
    """Wrap group 1 in real parentheses."""
    return "(" + matchobj.group(1) + ")"


def circled_brackets(matchobj):
    """Put the last character of group 1 inside braces."""
    return matchobj.group(1)[:-1] + "{" + matchobj.group(1)[-1] + "}"


# ---------------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------------
# Disabled alternative input (read from a file instead of the clipboard):
# try:
#     os.chdir(r"D:\mathdept\mathdept\文本处理程序等")
# except:
#     os.chdir(r"D:\mathdept\文本处理程序等")
# with open("textfile.txt", "r", encoding = "utf8") as textfile:
#     data = textfile.read()
data = getCopy()

# Strip the sizing prefixes of delimiters.  \rightarrow is rewritten first so
# it is out of the way; the look-ahead keeps commands that merely start with
# "\left" / "\right" (\leftarrow, \leftrightarrow, \rightleftharpoons, ...)
# intact.  A "." directly after the command is the invisible delimiter and is
# removed with it.
data = data.replace(r"\rightarrow", r"\to")
data = re.sub(r"\\(?:left|right)(?![A-Za-z])\.?", "", data)

# Full-width -> half-width punctuation.
# The full-width characters are spelled as \uXXXX escapes on purpose: written
# literally they had been normalised to their ASCII twins, which turned
# "(", ")" and "?" into invalid regular expressions and "." into a
# match-everything pattern.
# NOTE(review): the character originally replaced by " " was lost; U+3000
# IDEOGRAPHIC SPACE is assumed here -- confirm.
data = data.replace("\u3000", " ")
data = re.sub("(。[\n]*)", full_stop, data)
data = re.sub("(\uff0e[\n]*)", full_stop, data)   # FULLWIDTH FULL STOP
data = data.replace("\uff0c", ", ")               # FULLWIDTH COMMA
data = data.replace("\uff1a", ": ")               # FULLWIDTH COLON
data = data.replace("\uff1b", "; ")               # FULLWIDTH SEMICOLON
data = data.replace("\uff08", "(")                # FULLWIDTH LEFT PARENTHESIS
data = data.replace("\uff09", ")")                # FULLWIDTH RIGHT PARENTHESIS
data = data.replace("\uff1f", "? ")               # FULLWIDTH QUESTION MARK
data = data.replace("“", "``")
data = data.replace("”", "''")
data = data.replace(" ``", "``")
data = data.replace("'' ", "''")

# Question numbers ("12.", "12、", optionally prefixed by 例) -> \item
data = re.sub(r"例[\s]*", "例", data)
data = re.sub(r"(^[例]*[0-9]+[\s]*\.[\s]+)", r"\n\\item ", data)
data = re.sub(r"(^[例]*[0-9]+[\s]*、[\s]*)", r"\n\\item ", data)
data = re.sub(r"(\n[例]*[0-9]+[\s]*\.[\s]+)", r"\n\\item ", data)
data = re.sub(r"(\n[例]*[0-9]+[\s]*、[\s]*)", r"\n\\item ", data)

# Display-math delimiters \[ \] -> $, then drop "$$"
data = re.sub(r"\\\[", r"$", data)
data = re.sub(r"\\\]", r"$", data)
data = re.sub(r"\$\$", "", data)

# Separate punctuation from a following $
data = re.sub(r"([,.:;])\$", lambda x: x.group(1) + " $", data)

# Multiple-choice options -> \fourch{..}{..}{..}{..}
# Both layouts are applied twice, alternating, as in the original script.
for _ in range(2):
    data = re.sub(
        r"A\.([\s\S]*?)B\.([\s\S]*?)C\.([\s\S]*?)D\.([\s\S]*?)\n",
        multiple_choice,
        data,
    )
    data = re.sub(
        r"\(A\)([\s\S]*?)\(B\)([\s\S]*?)\(C\)([\s\S]*?)\(D\)([\s\S]*?)\n",
        multiple_choice,
        data,
    )
data = re.sub(r"\$[ ]+\}", "$}", data)
data = re.sub(r"\{[ ]+\$", "{$", data)

# \frac -> \dfrac
data = data.replace("\\frac", "\\dfrac")

# Collapse blank lines (repeated so long runs shrink completely)
for _ in range(20):
    data = re.sub("\n[\t ]*\n", "\n", data)

# Remove \quad / \qquad
data = re.sub(r"\\q+uad", "", data)
# Remove ~
data = re.sub(r"~", "", data)

data1 = data  # snapshot after the replacements above

# Split into plain text and formulas.  A formula is the shortest "$...$"
# span, delimiters included; texts and formulas alternate, text first.
raw_texts = []      # plain-text pieces
raw_equations = []  # formula pieces
d = data
while d:
    interval = re.search(r"\$[\s\S]*?\$", d)
    if interval is not None:
        (start, end) = interval.span()
        raw_texts.append(d[:start])
        raw_equations.append(d[start:end])
        d = d[end:]
    else:
        raw_texts.append(d)
        d = ""

modified_texts = []
modified_equations = []

for text in raw_texts:
    text1 = text
    # Remove useless whitespace just inside option braces
    text1 = re.sub(r"\{[\s]+?", "{", text1)
    text1 = re.sub(r"[\s]+?\}", "}", text1)
    # Fill-in-the-blank handling (disabled)
    # text1 = re.sub("[ _]{2,}",r"\\blank{50}",text1)
    # Multiple-choice answer brackets
    text1 = re.sub(r"\(\\blank\{50\}\)", r"\\bracket{20}", text1)
    text1 = re.sub(r"\([\s]{1,10}\)", r"\\bracket{20}", text1)
    # Exactly one space after a comma
    text1 = re.sub(",[ ]*", ", ", text1)
    text1 = re.sub(r"\.\}", "}", text1)
    text1 = re.sub(r"\n\d{1,3}\.", r"\n\\item ", text1)
    # text1 = re.sub(r"\s{2,}\.",r"\\blank{50}.",text1)
    # text1 = re.sub(r"\s{2,}\,",r"\\blank{50},",text1)
    text1 = re.sub(r"\s*\\bracket\{20\}\s*\n", r"\\bracket{20}.\n", text1)
    # Non-standard multiple-choice options
    text1 = re.sub(r"[\.;]\}", "}", text1)
    # A bare number between two CJK characters becomes a formula
    text1 = re.sub(
        r"([\u4e00-\u9fa5])[\s]+([\d]{1,6})[\s]+([\u4e00-\u9fa5])",
        lambda x: x.group(1) + "$" + x.group(2) + "$" + x.group(3),
        text1,
    )
    modified_texts.append(text1)

for equation in raw_equations:
    equation1 = equation
    # Drop braces around single-character sub/superscripts
    for _ in range(3):
        equation1 = re.sub(r"_\{([0-9a-zA-Z])\}", lambda x: "_" + x.group(1), equation1)
        equation1 = re.sub(r"\^\{([0-9a-zA-Z])\}", lambda x: "^" + x.group(1), equation1)
    # Join digits / capitals split by spaces (two passes: matches overlap)
    for _ in range(2):
        equation1 = re.sub(
            r"([0-9A-Z])\s+([0-9A-Z])",
            lambda x: x.group(1) + x.group(2),
            equation1,
        )
    # Combination / permutation symbols: C_n^m -> \mathrm{C}_n^m
    equation1 = re.sub(
        r"([CP])(_[^_\^]{,5}\^)",
        lambda x: r"\mathrm{" + x.group(1) + "}" + x.group(2),
        equation1,
    )
    # Units
    equation1 = re.sub(r"mathrm\{cm\}", "text{cm}", equation1)
    equation1 = re.sub(r"mathrm\{km\}", "text{km}", equation1)
    # ldots -> cdots
    equation1 = re.sub(r"ldots", "cdots", equation1)
    modified_equations.append(equation1)

# Re-interleave text and formulas.  There is either one formula per text
# piece or exactly one trailing text piece without a formula.
merged_parts = []
for index, text_part in enumerate(modified_texts):
    merged_parts.append(text_part)
    if index < len(modified_equations):
        merged_parts.append(modified_equations[index])
modified_data = "".join(merged_parts)

modified_data = re.sub(r"[ ]+\n", "\n", modified_data)
modified_data = re.sub(r"\$[\s]*?\\parallel[\s]*?\$", r"\\parallel", modified_data)
modified_data = re.sub(r"\n例\s*?\d{1,3}\s*", r"\n\\item ", modified_data)
modified_data = re.sub(r"(\$[\,\.:;]\$)", refine_brackets, modified_data)

# Remove spaces that Mathpix puts between CJK characters
for _ in range(3):
    modified_data = re.sub(
        r"([\u4e00-\u9fa5])( )([\u4e00-\u9fa5])",
        lambda x: x.group(1) + x.group(3),
        modified_data,
    )
# NOTE(review): the original indentation was lost; these two substitutions
# are taken to run once, after the loop above -- confirm.
modified_data = re.sub(r"\$ ", "$", modified_data)
modified_data = re.sub(r" \$", "$", modified_data)

# Characters Mathpix commonly misrecognises
modified_data = modified_data.replace("雉", "锥")
modified_data = re.sub("[粗秿]圆", "椭圆", modified_data)
modified_data = modified_data.replace("针角", "钝角")
modified_data = re.sub("投郑", "投掷", modified_data)
modified_data = re.sub("抛郑", "抛掷", modified_data)
modified_data = re.sub("范目", "范围", modified_data)
modified_data = re.sub("揷", "插", modified_data)

# Vectors and bars
modified_data = modified_data.replace(r"\vec", r"\overrightarrow ")
modified_data = modified_data.replace(r"\bar", r"\overline ")

# Limits
modified_data = re.sub(
    r"\\lim[\s]*_\{n \\to \\infty\}",
    r"\\displaystyle\\lim_{n\\to\\infty}",
    modified_data,
)

# Enumeration comma between formulas
modified_data = modified_data.replace(r" 、 ", r"$、$")

# \leqslant -> \leq etc., and assorted small fixes
modified_data = modified_data.replace(r"slant", "")
modified_data = modified_data.replace(r"\mid", "|")
modified_data = re.sub(r"\\mathrm\{\\mathrm\{i\}\}", r"\\mathrm{i}", modified_data)
modified_data = modified_data.replace(",$", ", $")
modified_data = modified_data.replace(" / /", r"\parallel")
modified_data = modified_data.replace("mathrmR", r"mathbf{R}")
modified_data = modified_data.replace(r"^{\prime}", "'")
# Fractions inside sub/superscripts go back to the small \frac
modified_data = re.sub(r"\^\{\\dfrac", r"^{\\frac", modified_data)
modified_data = re.sub(r"\^\{-\\dfrac", r"^{-\\frac", modified_data)
modified_data = re.sub(r"_\{\\dfrac", r"_{\\frac", modified_data)
modified_data = re.sub(r"_\{-\\dfrac", r"_{-\\frac", modified_data)

# Piecewise functions: \{\begin{array}{..} -> \begin{cases}
modified_data = re.sub(r"\\{\\begin\{array\}\{[rcl]*\}", r"\\begin{cases}", modified_data)
modified_data = re.sub(r"\\end{array}", r"\\end{cases}", modified_data)

# Space after a colon
modified_data = re.sub(r":([\S])", lambda x: ": " + x.group(1), modified_data)

# A question that ends without punctuation right before the next \item is
# treated as fill-in-the-blank
modified_data = re.sub(
    r"([\u4e00-\u9fa5\$])[\s]*\n\\item",
    lambda x: x.group(1) + "\\blank{50}.\n\\item",
    modified_data,
)

# Multiple-choice answer brackets
modified_data = re.sub(r"\$\(\s*\)\$", r"\\bracket{20}", modified_data)
modified_data = re.sub(
    r"([\u4e00-\u9fa5\$])[\s]*\n\\fourch",
    lambda x: x.group(1) + "\\bracket{20}.\n\\fourch",
    modified_data,
)

# Arcs
modified_data = re.sub(r"overparen", r"overset\\frown", modified_data)

# Two adjacent formulas: drop the "$$" between them
modified_data = re.sub(
    r"([\S])(\$\$)([\S])",
    lambda x: x.group(1) + x.group(3),
    modified_data,
)
# Publish the result: put it on the clipboard first, then keep a UTF-8 copy
# on disk (the path is relative to the current working directory).
setCopy(modified_data)
with open("临时文件/outputfile.txt", mode="w", encoding="utf8") as output_file:
    output_file.write(modified_data)