"""Report pairs of problems in the problem bank that look like duplicates.

The script loads the problem bank JSON, normalises every selected problem's
content, compares new problems against old problems and against each other,
and writes every suspected pair that has not been annotated yet to a text
file.
"""
import difflib
import json
import os
import re
import time

import Levenshtein

# IMPORTANT: id ranges of the new and the old problems (an id that falls in
# both ranges is treated as a new problem).
id_new_problems = "1:50000"
id_old_problems = "1:50000"
# A pair whose similarity is strictly greater than this value is a suspect.
threshold = 0.9999


def generate_number_set(string):
    """Expand an id-range specification into a list of 6-character id strings.

    Blocks are separated by commas. A block containing ``:`` is read as
    ``start:end`` and expands to every integer of the closed interval; any
    other block is taken as a single id. All whitespace is ignored and every
    id is left-padded with zeros to six characters.

    Empty blocks (an empty specification, a trailing or doubled comma) are
    skipped instead of producing the bogus id ``"000000"``.

    Raises:
        ValueError: if a range endpoint is not an integer or a block contains
            more than one ``:``.
    """
    numbers_list = []
    for block in re.sub(r"\s", "", string).split(","):
        if not block:
            continue
        if ":" not in block:
            numbers_list.append(block.zfill(6))
            continue
        start, end = block.split(":")
        numbers_list.extend(
            str(ind).zfill(6) for ind in range(int(start), int(end) + 1)
        )
    return numbers_list


# Substitutions applied by pre_treating, in this exact order. The order
# matters: the macro pattern needs the braces that the third pattern strips.
_PRE_TREATING_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # Whole \begin{center} ... \end{center} environments (non-greedy).
        r"\\begin\{center\}[\s\S]*?\\end\{center\}",
        # The tokens bracket{N}, blank{N}, fourch, twoch and onech
        # (presumably answer-slot / choice-layout macros -- TODO confirm).
        r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)",
        # Whitespace, backslashes, braces, dollar signs, parentheses and
        # square brackets.
        r"[\s\\\{\}\$\(\)\[\]]",
        # The words "displaystyle" and "overrightarrow".
        r"(displaystyle)|(overrightarrow)",
        # ASCII punctuation: comma, full stop, colon, semicolon, question mark.
        r"[,\.:;?]",
    )
)


def pre_treating(string):
    """Normalise a problem's content string before similarity comparison.

    Every match of each pattern in ``_PRE_TREATING_PATTERNS`` is deleted, one
    pattern after another, and the remaining text is returned.

    A former extra pass deleting ``\\n`` and ``\\t`` was dropped: it ran after
    all whitespace had already been removed and therefore never matched.
    """
    for pattern in _PRE_TREATING_PATTERNS:
        string = pattern.sub("", string)
    return string


def difflab_get_equal_rate(str1, str2):
    """Return ``difflib.SequenceMatcher(None, str1, str2).ratio()``.

    The arguments are compared as given; no pre-treatment is applied here.
    """
    return difflib.SequenceMatcher(None, str1, str2).ratio()


def jaro_get_equal_rate(str1, str2):
    """Return ``Levenshtein.jaro(str1, str2)``.

    The arguments are compared as given; no pre-treatment is applied here.
    """
    return Levenshtein.jaro(str1, str2)


def Lev_get_equal_rate(str1, str2):
    """Return ``Levenshtein.ratio(str1, str2)``.

    The arguments are compared as given; no pre-treatment is applied here.
    """
    return Levenshtein.ratio(str1, str2)


# Fields whose mention of the other id makes a pair a suspect regardless of
# the similarity score.
_LINK_FIELDS = ("related", "same")
# Fields whose mention of the other id means the pair is already annotated.
_ANNOTATION_FIELDS = ("related", "same", "unrelated")


def _cross_referenced(first_id, first_problem, second_id, second_problem, fields):
    """Tell whether two problems mention each other in any of ``fields``.

    Checks ``first_id in second_problem[field]`` for every field, then
    ``second_id in first_problem[field]`` for every field, stopping at the
    first hit. Whether ``in`` is list membership or substring search depends
    on the type stored in the bank, which this file does not show.
    """
    return any(first_id in second_problem[field] for field in fields) or any(
        second_id in first_problem[field] for field in fields
    )


def _format_report_entry(rate, first_id, first_problem, second_id, second_problem):
    """Build the report text for one suspected pair."""
    return (
        ("%.4f" % rate)
        + "\n\n"
        + first_id + " " + first_problem["content"]
        + "\n\n"
        + second_id + " " + second_problem["content"]
        + "\n\n"
    )


# Comparison method used for every pair.
sim_test = jaro_get_equal_rate

# Load the problem bank.
with open(r"../题库0.3/Problems.json","r",encoding = "utf8") as f:
    database = f.read()
pro_dict = json.loads(database)

# Select the new and the old problems that actually exist in the bank.
new_id_list_raw = generate_number_set(id_new_problems)
old_id_list_raw = generate_number_set(id_old_problems)
# Sets make each membership test O(1); the raw lists can hold tens of
# thousands of ids and are probed once per problem in the bank.
new_id_set = set(new_id_list_raw)
old_id_set = set(old_id_list_raw)
new_id_list = [problem_id for problem_id in pro_dict if problem_id in new_id_set]
old_id_list = [
    problem_id
    for problem_id in pro_dict
    if problem_id in old_id_set and problem_id not in new_id_set
]

new_problems_dict = {problem_id: pro_dict[problem_id] for problem_id in new_id_list}
old_problems_dict = {problem_id: pro_dict[problem_id] for problem_id in old_id_list}
# Normalised content, computed once per problem rather than once per pair.
new_problems_dict_content = {
    problem_id: pre_treating(problem["content"])
    for problem_id, problem in new_problems_dict.items()
}
old_problems_dict_content = {
    problem_id: pre_treating(problem["content"])
    for problem_id, problem in old_problems_dict.items()
}
print("旧题目数:",len(old_problems_dict),", 新题目数:",len(new_problems_dict))

# Record the start time.
start_time = time.time()
suspect_count = 0
remarked = 0
# Report entries are collected in a list and joined once at the end, which
# avoids repeated string concatenation inside the nested loops.
report_entries = []

# Compare every new problem with every old problem.
print("开始新题与旧题的比对")
for count, (id_new, problem_new) in enumerate(new_problems_dict.items(), start=1):
    if count % 50 == 0:
        print(count)
    content_new = new_problems_dict_content[id_new]
    for id_old, problem_old in old_problems_dict.items():
        similar_rate = sim_test(content_new, old_problems_dict_content[id_old])
        if similar_rate > threshold or _cross_referenced(
            id_new, problem_new, id_old, problem_old, _LINK_FIELDS
        ):
            suspect_count += 1
            if _cross_referenced(
                id_new, problem_new, id_old, problem_old, _ANNOTATION_FIELDS
            ):
                # Already annotated: count it but keep it out of the report.
                remarked += 1
            else:
                report_entries.append(
                    _format_report_entry(
                        similar_rate, id_new, problem_new, id_old, problem_old
                    )
                )

# Compare the new problems with each other, each unordered pair exactly once:
# every problem is compared with the problems that follow it.
print("开始新题之间的比对")
new_ids = list(new_problems_dict)
for count, current_id in enumerate(new_ids[:-1], start=1):
    if count % 50 == 0:
        print(count)
    current_problem = new_problems_dict[current_id]
    # The content cache is keyed by the bank key, so it is indexed by that key.
    current_problem_content = new_problems_dict_content[current_id]
    for id_new in new_ids[count:]:
        problem_new = new_problems_dict[id_new]
        similar_rate = sim_test(new_problems_dict_content[id_new], current_problem_content)
        if similar_rate > threshold or _cross_referenced(
            id_new, problem_new, current_id, current_problem, _LINK_FIELDS
        ):
            suspect_count += 1
            if _cross_referenced(
                id_new, problem_new, current_id, current_problem, _ANNOTATION_FIELDS
            ):
                remarked += 1
            else:
                report_entries.append(
                    _format_report_entry(
                        similar_rate, id_new, problem_new, current_id, current_problem
                    )
                )

alike_problems = "".join(report_entries)

# Record the end time and show the results.
end_time = time.time()
print("总耗时:",end_time-start_time,"秒.")
print("发现相似: ",suspect_count,", 其中已标注: ",remarked,".")

output_path = "临时文件/相似题目.txt"
# Create the output directory if needed so a long run is not lost at the
# final write.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path,"w",encoding="utf8") as f:
    f.write(alike_problems)