# Listing metadata from the code viewer: 129 lines, 5.3 KiB, Python
import os,re,difflib,Levenshtein,time,json
|
|
|
|
# IMPORTANT: ID ranges of the new and the old problems. An ID that appears in
# both ranges is treated as a new problem.
id_new_problems: str = "1:50000"
id_old_problems: str = "1:50000"
# A pair is flagged when its similarity score is strictly above this value.
threshold: float = 0.9999
|
|
|
|
# Build the ID list: commas separate chunks; inside a chunk "a:b" denotes the
# closed integer interval [a, b].
def generate_number_set(string):
    """Expand an ID specification into a list of 6-digit, zero-padded IDs.

    Args:
        string: Comma-separated chunks. A chunk is either a single ID or a
            closed range written as ``start:end``. All whitespace is ignored.

    Returns:
        A list of ID strings left-padded with zeros to 6 characters, in the
        order the chunks appear. A range whose start exceeds its end
        contributes nothing.

    Raises:
        ValueError: If a range chunk does not consist of exactly two integers
            separated by one colon.
    """
    compact = re.sub(r"\s", "", string)
    numbers_list = []
    for chunk in compact.split(","):
        if not chunk:
            # Empty chunks come from empty input or a trailing/doubled comma;
            # zero-filling "" would fabricate the bogus ID "000000".
            continue
        if ":" not in chunk:
            numbers_list.append(chunk.zfill(6))
        else:
            start, end = chunk.split(":")
            numbers_list.extend(
                str(number).zfill(6)
                for number in range(int(start), int(end) + 1)
            )
    return numbers_list
|
|
|
|
# String preprocessing
def pre_treating(string):
    """Strip layout and markup noise from a problem text before comparison.

    The removal passes run in order, each deleting every match of its pattern:

    1. whole ``\\begin{center} ... \\end{center}`` environments;
    2. the tokens ``bracket{N}``, ``blank{N}``, ``fourch``, ``twoch``, ``onech``
       (done before braces are stripped, so the ``{N}`` part still matches);
    3. whitespace, backslashes, braces, dollar signs, parentheses and square
       brackets;
    4. newlines and tabs;
    5. the words ``displaystyle`` and ``overrightarrow``;
    6. the ASCII punctuation marks ``, . : ; ?``.

    Args:
        string: The raw problem text.

    Returns:
        The text with all of the above removed.
    """
    removal_passes = (
        r"\\begin\{center\}[\s\S]*?\\end\{center\}",
        r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)",
        r"[\s\\\{\}\$\(\)\[\]]",
        r"[\n\t]",
        r"(displaystyle)|(overrightarrow)",
        r"[,\.:;?]",
    )
    cleaned = string
    for pattern in removal_passes:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned
|
|
|
|
# difflib-based string comparison
def difflab_get_equal_rate(str1, str2):
    """Return the ``difflib.SequenceMatcher`` ratio of the two strings.

    The strings are compared exactly as given; no preprocessing is applied
    inside this function.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The value of ``SequenceMatcher.ratio()`` for the pair.
    """
    matcher = difflib.SequenceMatcher(a=str1, b=str2)
    return matcher.ratio()
|
|
|
|
# Levenshtein (Jaro) string comparison
def jaro_get_equal_rate(str1, str2):
    """Return the Jaro similarity of the two strings via ``Levenshtein.jaro``.

    The strings are compared exactly as given; no preprocessing is applied
    inside this function.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        Whatever ``Levenshtein.jaro(str1, str2)`` returns.
    """
    similarity = Levenshtein.jaro(str1, str2)
    return similarity
|
|
|
|
# Levenshtein string comparison
def Lev_get_equal_rate(str1, str2):
    """Return the similarity of the two strings via ``Levenshtein.ratio``.

    The strings are compared exactly as given; no preprocessing is applied
    inside this function.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        Whatever ``Levenshtein.ratio(str1, str2)`` returns.
    """
    similarity = Levenshtein.ratio(str1, str2)
    return similarity
|
|
|
|
|
|
|
|
|
|
# Select the similarity measure used for every comparison below.
sim_test = jaro_get_equal_rate

# Load the problem database: a JSON object whose keys are problem IDs.
with open(r"../题库0.3/Problems.json","r",encoding = "utf8") as f:
    pro_dict = json.load(f)

# Split the database into new and old problems. Membership is tested against
# sets (the raw lists can hold tens of thousands of IDs); an ID that lies in
# both ranges is kept as a new problem only.
new_id_list_raw = generate_number_set(id_new_problems)
old_id_list_raw = generate_number_set(id_old_problems)
new_id_set = set(new_id_list_raw)
old_id_set = set(old_id_list_raw)
new_id_list = [pid for pid in pro_dict if pid in new_id_set]
old_id_list = [
    pid for pid in pro_dict
    if pid in old_id_set and pid not in new_id_set
]
new_problems_dict = {pid: pro_dict[pid] for pid in new_id_list}
old_problems_dict = {pid: pro_dict[pid] for pid in old_id_list}
# Preprocessed texts, computed once per problem rather than once per pair.
new_problems_dict_content = {
    pid: pre_treating(problem["content"])
    for pid, problem in new_problems_dict.items()
}
old_problems_dict_content = {
    pid: pre_treating(problem["content"])
    for pid, problem in old_problems_dict.items()
}
print("旧题目数:",len(old_problems_dict),", 新题目数:",len(new_problems_dict))


def _assess_pair(id_a, problem_a, text_a, id_b, problem_b, text_b):
    """Score one pair of problems and classify it.

    Returns a tuple ``(rate, suspect, annotated)``:
      * ``rate`` is ``sim_test`` applied to the two preprocessed texts;
      * ``suspect`` is True when ``rate`` exceeds ``threshold`` or either
        problem lists the other under "related" or "same";
      * ``annotated`` is True when either problem already lists the other
        under "related", "same" or "unrelated" (only meaningful for suspects).
    """
    rate = sim_test(text_a, text_b)
    linked = (
        id_a in problem_b["related"]
        or id_a in problem_b["same"]
        or id_b in problem_a["related"]
        or id_b in problem_a["same"]
    )
    if not (rate > threshold or linked):
        return rate, False, False
    annotated = (
        linked
        or id_a in problem_b["unrelated"]
        or id_b in problem_a["unrelated"]
    )
    return rate, True, annotated


def _format_pair(rate, id_a, problem_a, id_b, problem_b):
    """Render one unannotated suspect pair for the report file."""
    return "%.4f\n\n%s %s\n\n%s %s\n\n" % (
        rate, id_a, problem_a["content"], id_b, problem_b["content"]
    )


# Record the start time and reset the counters.
start_time = time.time()
suspect_count = 0
remarked = 0
# Report chunks are collected in a list and joined once at the end, instead
# of growing one string with repeated concatenation.
alike_chunks = []

# Compare every new problem with every old problem.
count = 0
print("开始新题与旧题的比对")
for id_new, new_problem in new_problems_dict.items():
    count += 1
    if count % 50 == 0:
        print(count)
    new_text = new_problems_dict_content[id_new]
    for id_old, old_problem in old_problems_dict.items():
        similar_rate, suspect, annotated = _assess_pair(
            id_new, new_problem, new_text,
            id_old, old_problem, old_problems_dict_content[id_old],
        )
        if not suspect:
            continue
        suspect_count += 1
        if annotated:
            remarked += 1
        else:
            alike_chunks.append(
                _format_pair(similar_rate, id_new, new_problem, id_old, old_problem)
            )

# Compare the new problems with each other, each unordered pair exactly once.
# The dict key is used as the problem ID, so the lookup does not depend on a
# matching "id" field inside the record.
print("开始新题之间的比对")
new_ids = list(new_problems_dict)
count = 0
for count, id_current in enumerate(new_ids[:-1], start=1):
    if count % 50 == 0:
        print(count)
    current_problem = new_problems_dict[id_current]
    current_problem_content = new_problems_dict_content[id_current]
    # new_ids[count:] holds exactly the IDs that come after id_current.
    for id_new in new_ids[count:]:
        similar_rate, suspect, annotated = _assess_pair(
            id_new, new_problems_dict[id_new], new_problems_dict_content[id_new],
            id_current, current_problem, current_problem_content,
        )
        if not suspect:
            continue
        suspect_count += 1
        if annotated:
            remarked += 1
        else:
            alike_chunks.append(
                _format_pair(
                    similar_rate,
                    id_new, new_problems_dict[id_new],
                    id_current, current_problem,
                )
            )

# Record the end time and show the results.
end_time = time.time()
print("总耗时:",end_time-start_time,"秒.")
print("发现相似: ",suspect_count,", 其中已标注: ",remarked,".")

alike_problems = "".join(alike_chunks)
output_path = "临时文件/相似题目.txt"
# Create the output directory if it is missing so the run does not fail only
# after all comparisons have finished.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path,"w",encoding="utf8") as f:
    f.write(alike_problems)