"""Report problems in the problem database whose text is nearly identical.

The script loads ``Problems.json``, normalises every problem's ``content``
with :func:`pre_treating`, compares the problems pairwise with ``sim_test``
and writes each group of alike problem ids to ``outputfile`` (one
comma-separated group per paragraph).
"""
import difflib
import json
import os
import re
import time

import Levenshtein

# Two problems are reported as alike when their similarity exceeds this value.
threshold = 0.99
outputfile = r"临时文件/相同题目列表.txt"

# Substitutions applied, in this order, by pre_treating().  The order matters:
# the keyword patterns that mention braces must run before braces are stripped,
# and stripping whitespace/backslashes can expose "displaystyle" and
# "overrightarrow" for the step that follows.
_PRE_TREATING_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"\\begin\{center\}[\s\S]*?\\end\{center\}",
        r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)",
        r"[\s\\\{\}\$\(\)\[\]]",
        r"(displaystyle)|(overrightarrow)",
        r"[,\.:;?]",
    )
)


def generate_number_set(string):
    """Expand a selection string into a list of six-character codes.

    Blocks are separated by commas.  A block is either a single code, which
    is left-padded with zeros to six characters, or ``start:end``, which
    stands for every integer in the closed interval [start, end].  All
    whitespace in ``string`` is ignored, and empty blocks (empty input,
    doubled or trailing commas) are skipped instead of yielding ``"000000"``.

    Raises:
        ValueError: if a ``start:end`` block has non-integer bounds or
            contains more than one colon.
    """
    string = re.sub(r"\s", "", string)
    numbers_list = []
    for block in string.split(","):
        if not block:
            continue
        if ":" not in block:
            numbers_list.append(block.zfill(6))
        else:
            start, end = block.split(":")
            numbers_list.extend(
                str(number).zfill(6) for number in range(int(start), int(end) + 1)
            )
    return numbers_list


def pre_treating(string):
    """Return ``string`` with the text ignored during comparison removed.

    Removes, in order: ``center`` environments, the ``bracket{n}`` /
    ``blank{n}`` / ``fourch`` / ``twoch`` / ``onech`` keywords, all
    whitespace, backslashes, braces, dollar signs, parentheses and square
    brackets, the words ``displaystyle`` and ``overrightarrow``, and the
    punctuation characters ``, . : ; ?``.
    """
    for pattern in _PRE_TREATING_PATTERNS:
        string = pattern.sub("", string)
    return string


def difflab_get_equal_rate(str1, str2):
    """Return the ``difflib.SequenceMatcher`` ratio of the two strings.

    The strings are compared as given; callers pre-treat them beforehand.
    """
    return difflib.SequenceMatcher(None, str1, str2).ratio()


def jaro_get_equal_rate(str1, str2):
    """Return ``Levenshtein.jaro`` of the two strings.

    The strings are compared as given; callers pre-treat them beforehand.
    """
    return Levenshtein.jaro(str1, str2)


def Lev_get_equal_rate(str1, str2):
    """Return ``Levenshtein.ratio`` of the two strings.

    The strings are compared as given; callers pre-treat them beforehand.
    """
    return Levenshtein.ratio(str1, str2)


def _find_alike_groups(problems, treated, similarity, limit):
    """Group problem ids whose treated content is more similar than ``limit``.

    Ids are visited in the iteration order of ``treated``.  Each id that has
    not already been placed in a group is compared with every later id that
    is also still ungrouped, skipping ids already listed in its ``"same"`` or
    ``"related"`` entry in ``problems``.  The matches, if any, form a group
    headed by the current id and are excluded from all further comparisons.
    Progress (the number of ids processed) is printed every 500 ids.

    Returns:
        A list of groups; each group is a list of ids starting with the id
        the others were matched against.
    """
    ids = list(treated)
    grouped = set()
    groups = []
    processed = 0
    for position, current_id in enumerate(ids):
        if current_id in grouped:
            continue
        candidates = [
            other_id for other_id in ids[position + 1:] if other_id not in grouped
        ]
        if not candidates:
            # Nothing left to compare against, so no further group can form.
            break
        processed += 1
        if processed % 500 == 0:
            print(processed)
        current_content = treated[current_id]["content"]
        known_same = problems[current_id]["same"]
        known_related = problems[current_id]["related"]
        matches = [
            other_id
            for other_id in candidates
            if other_id not in known_same
            and other_id not in known_related
            and similarity(current_content, treated[other_id]["content"]) > limit
        ]
        if matches:
            grouped.update(matches)
            groups.append([current_id] + matches)
    return groups


# Similarity measure used for the comparison.
sim_test = jaro_get_equal_rate

# Load the problem database.
with open(r"../题库0.3/Problems.json", "r", encoding="utf8") as f:
    pro_dict = json.load(f)

# Shallow copies with pre-treated content; the originals stay untouched.
pro_dict_treated = {}
for problem_id, problem in pro_dict.items():
    treated_problem = problem.copy()
    treated_problem["content"] = pre_treating(treated_problem["content"])
    pro_dict_treated[problem_id] = treated_problem
print("题目数:", len(pro_dict))

# Record the start time.
starttime = time.time()

alike_groups = _find_alike_groups(pro_dict, pro_dict_treated, sim_test, threshold)
# One comma-separated group per paragraph.
alike_problems = "".join(",".join(group) + "\n\n" for group in alike_groups)

endtime = time.time()
print("耗时: %.3f秒" % (endtime - starttime))

with open(outputfile, "w", encoding="u8") as f:
    f.write(alike_problems)