"""Report, for every problem in a LaTeX file, the most similar problem in the bank.

The problem bank is a JSON object mapping problem IDs to records that carry a
"content" string. Each problem of the LaTeX file is normalised with
`pre_treating`, compared against every selected bank problem with `sim_test`,
and the best score / new-problem index / best bank ID are printed and written
to a text file.
"""
import os
import re
import difflib
import time
import json

try:
    import Levenshtein
except ImportError:
    # Deferred failure: the parsing helpers and the difflib metric stay usable
    # without the third-party package; the Levenshtein-based metrics raise a
    # clear ImportError when they are actually called.
    Levenshtein = None

# IMPORTANT: range of bank problem IDs to compare against
# (see generate_number_set for the syntax).
old_problems_range = "1:50000"
# NOTE(review): `threshold` is not read anywhere in this file - confirm whether
# filtering the report by it was intended.
threshold = 0.85
# File whose problems are compared against the bank.
filename = r"C:\Users\weiye\Documents\wwy sync\临时工作区\自拟题目14.tex"


def generate_number_set(string):
    """Expand a range specification into a list of 6-character ID strings.

    Segments are separated by commas; inside a segment ``start:end`` denotes
    the closed integer interval. All whitespace is ignored. Every entry is
    left-padded with zeros to six characters. Empty segments (for example the
    one produced by a trailing comma) are skipped instead of yielding the
    bogus ID "000000".
    """
    string = re.sub(r"[\n\s]", "", string)
    numbers_list = []
    for segment in string.split(","):
        if not segment:
            continue
        if ":" not in segment:
            numbers_list.append(segment.zfill(6))
        else:
            start, end = segment.split(":")
            numbers_list.extend(
                str(ind).zfill(6) for ind in range(int(start), int(end) + 1)
            )
    return numbers_list


def pre_treating(string):
    """Normalise a LaTeX problem text before similarity comparison.

    Removes center environments, a fixed set of macro names, whitespace,
    backslashes, brackets, dollar signs and ASCII punctuation. The order of
    the substitutions is significant and is kept as is.
    """
    string = re.sub(r"\\begin\{center\}[\s\S]*?\\end\{center\}", "", string)
    string = re.sub(r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)|(mathrm)|(text)", "", string)
    string = re.sub(r"[\s\\\{\}\$\(\)\[\]]", "", string)
    string = re.sub(r"[\n\t]", "", string)
    string = re.sub(r"(displaystyle)|(overrightarrow)", "", string)
    string = re.sub(r"[,\.:;?]", "", string)
    return string


def _levenshtein():
    """Return the Levenshtein module or raise if it could not be imported."""
    if Levenshtein is None:
        raise ImportError(
            "the 'Levenshtein' package is required for this similarity metric"
        )
    return Levenshtein


def difflab_get_equal_rate(str1, str2):
    """Similarity of two strings according to difflib.SequenceMatcher.ratio()."""
    return difflib.SequenceMatcher(None, str1, str2).ratio()


def jaro_get_equal_rate(str1, str2):
    """Similarity of two strings according to Levenshtein.jaro()."""
    return _levenshtein().jaro(str1, str2)


def Lev_get_equal_rate(str1, str2):
    """Similarity of two strings according to Levenshtein.ratio()."""
    return _levenshtein().ratio(str1, str2)


def GenerateProblemListFromString(problem_string):
    """Split LaTeX source into a list of ``(problem_text, suffix)`` tuples.

    Only the body of the document environment is used when one is present;
    otherwise the whole string is parsed. A problem is the stripped text
    between an ``\\item`` and the next ``\\item`` or ``\\end{enumerate}``.
    ``suffix`` is the text of the last marker line (a newline, ``%`` and then
    letters/digits) found before the problem's ``\\item``, or "" if there is
    none.
    """
    document = re.search(r"\\begin\{document\}([\s\S]*?)\\end\{document\}", problem_string)
    data = document.group(1) if document else problem_string
    data = re.sub(r"\n{2,}", "\n", data)
    # Terminate every item explicitly so that a single pattern can extract it.
    data = re.sub(r"\\item", r"\\enditem\\item", data)
    data = re.sub(r"\\end\{enumerate\}", r"\\enditem", data)
    ProblemsList = []
    for item in re.finditer(r"\\item([\s\S]*?)\\enditem", data):
        # Use the position of this very match: searching for the text again
        # would hit the first occurrence and give repeated problems the
        # suffix of an earlier one.
        suflist = re.findall(r"\n\%[\dA-Za-z]+", data[:item.start()])
        suffix = suflist[-1].replace("%", "").strip() if suflist else ""
        ProblemsList.append((item.group(1).strip(), suffix))
    return ProblemsList


def find_best_matches(new_dict_treated, pro_dict_treated, sim):
    """Yield ``(maxsim, index, argmax)`` for every entry of `new_dict_treated`.

    `argmax` is the key of `pro_dict_treated` whose text has the highest `sim`
    score against the new text, and `maxsim` is that score. When no bank text
    scores above 0 (or the bank is empty) the result is ``(0, index, None)``;
    the best key is reset for every new entry so it can never leak from a
    previous one.
    """
    for index, new_p in new_dict_treated.items():
        maxsim = 0
        argmax = None
        for key, old_p in pro_dict_treated.items():
            score = sim(new_p, old_p)
            if score > maxsim:
                maxsim = score
                argmax = key
        yield maxsim, index, argmax


# Similarity metric used for the comparison.
sim_test = jaro_get_equal_rate


def main():
    """Compare the problems of `filename` with the bank and write the report."""
    # Load the problem bank.
    with open(r"../题库0.3/Problems.json", "r", encoding="utf8") as f:
        pro_dict = json.load(f)
    with open(filename, "r", encoding="u8") as f:
        new_pro_list = GenerateProblemListFromString(f.read())

    # A set keeps the membership test cheap for large ranges.
    wanted_ids = set(generate_number_set(old_problems_range))
    pro_dict_treated = {
        key: pre_treating(pro_dict[key]["content"])
        for key in pro_dict
        if key in wanted_ids
    }
    # New problems are numbered from 1 in file order.
    new_dict_treated = {
        number: pre_treating(problem)
        for number, (problem, _suffix) in enumerate(new_pro_list, start=1)
    }

    report = []
    for maxsim, number, argmax in find_best_matches(new_dict_treated, pro_dict_treated, sim_test):
        line = "%.3f\t%d\t%s" % (maxsim, number, argmax)
        print(line)
        report.append(line + "\n")

    with open("临时文件/新题相似相同.txt", "w", encoding="u8") as f:
        f.write("".join(report))


if __name__ == "__main__":
    main()