102 lines
2.9 KiB
Python
102 lines
2.9 KiB
Python
import os,re,difflib,Levenshtein,time,json
|
|
|
|
# Similarity score that a pair of problems must exceed to be reported as
# the same problem.
threshold = 0.99

# Path of the text report that lists the groups of identical problems.
outputfile = r"临时文件/相同题目列表.txt"
|
|
|
|
def generate_number_set(string):
    """Expand a compact ID specification into a list of 6-character IDs.

    The specification is a comma-separated list of segments. A segment is
    either a single value or ``start:end``, which denotes the closed integer
    range from ``start`` to ``end``. All whitespace is ignored.

    Args:
        string: The specification, e.g. ``"1, 3:5"``.

    Returns:
        The IDs in specification order, each left-padded with zeros to six
        characters, e.g. ``["000001", "000003", "000004", "000005"]``.
        Single values are padded as written and are not validated as
        integers. Empty segments (empty input, trailing or doubled commas)
        contribute nothing.

    Raises:
        ValueError: If a range segment contains more than one ``:`` or an
            endpoint that is not an integer.
    """
    compact = re.sub(r"\s", "", string)
    numbers_list = []
    for segment in compact.split(","):
        if not segment:
            # Without this guard an empty segment would be padded into the
            # bogus ID "000000".
            continue
        if ":" in segment:
            start, end = segment.split(":")
            numbers_list.extend(
                str(number).zfill(6)
                for number in range(int(start), int(end) + 1)
            )
        else:
            numbers_list.append(segment.zfill(6))
    return numbers_list
|
|
|
|
def pre_treating(string):
    """Normalize a problem's source text before similarity comparison.

    The passes run in a fixed order because later ones rely on earlier ones:

    1. Remove every ``\\begin{center} ... \\end{center}`` environment
       (non-greedy, may span several lines).
    2. Remove the markers ``bracket{N}``, ``blank{N}``, ``fourch``,
       ``twoch`` and ``onech``. This must happen before braces are
       stripped so that the ``{N}`` argument is still recognizable.
    3. Strip all whitespace, backslashes, braces, dollar signs, parentheses
       and square brackets.
    4. Remove the words ``displaystyle`` and ``overrightarrow``.
    5. Remove the ASCII punctuation ``, . : ; ?``.

    Args:
        string: Raw problem content.

    Returns:
        The normalized string.
    """
    string = re.sub(r"\\begin\{center\}[\s\S]*?\\end\{center\}", "", string)
    string = re.sub(
        r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)", "", string
    )
    # \s already covers newlines and tabs, so no separate pass is needed
    # for them afterwards.
    string = re.sub(r"[\s\\\{\}\$\(\)\[\]]", "", string)
    string = re.sub(r"(displaystyle)|(overrightarrow)", "", string)
    string = re.sub(r"[,\.:;?]", "", string)
    return string
|
|
|
|
def difflab_get_equal_rate(str1, str2):
    """Return the difflib similarity ratio of two strings.

    The strings are compared exactly as given; no preprocessing is applied
    here.

    NOTE(review): the matcher is built with difflib's default ``autojunk``
    setting, whose heuristic affects long sequences -- confirm this is
    intended before using this comparator on long problem texts.
    """
    matcher = difflib.SequenceMatcher(None, str1, str2)
    return matcher.ratio()
|
|
|
|
def jaro_get_equal_rate(str1, str2):
    """Return the similarity of two strings as computed by ``Levenshtein.jaro``.

    The strings are compared exactly as given; no preprocessing is applied
    here.
    """
    score = Levenshtein.jaro(str1, str2)
    return score
|
|
|
|
def Lev_get_equal_rate(str1, str2):
    """Return the similarity of two strings as computed by ``Levenshtein.ratio``.

    The strings are compared exactly as given; no preprocessing is applied
    here.
    """
    score = Levenshtein.ratio(str1, str2)
    return score
|
|
|
|
|
|
|
|
|
|
# Comparison method used by the duplicate scan; assign one of the other
# *_get_equal_rate functions here to switch methods.
sim_test = jaro_get_equal_rate
|
|
|
|
# Load the problem database: a JSON object keyed by problem ID.
with open(r"../题库0.3/Problems.json", "r", encoding="utf8") as f:
    pro_dict = json.load(f)

# Build a parallel mapping whose "content" fields are normalized for
# comparison. Each entry is a shallow copy, so pro_dict keeps the original
# content untouched.
pro_dict_treated = {
    problem_id: {**problem, "content": pre_treating(problem["content"])}
    for problem_id, problem in pro_dict.items()
}

print("题目数:", len(pro_dict))
|
|
|
|
# Record the start time so the scan duration can be reported.
starttime = time.perf_counter()

# One entry per detected group: the reference ID followed by the IDs found
# to be the same problem, comma-separated.
duplicate_groups = []

count = 0
keys = list(pro_dict_treated)
while len(keys) >= 2:
    count += 1
    if count % 500 == 0:
        # Progress indicator: number of reference problems processed so far.
        print(count)

    currentid = keys.pop(0)
    content1 = pro_dict_treated[currentid]["content"]
    # Problems already recorded as "same" or "related" for the reference
    # problem are not reported again; look these up once per reference.
    known_same = pro_dict[currentid]["same"]
    known_related = pro_dict[currentid]["related"]

    same = [
        other_id
        for other_id in keys
        if other_id not in known_same
        and other_id not in known_related
        and sim_test(content1, pro_dict_treated[other_id]["content"]) > threshold
    ]

    if same:
        duplicate_groups.append(",".join([currentid, *same]))
        # Matched problems join this group and are never used as a
        # reference themselves.
        matched = set(same)
        keys = [key for key in keys if key not in matched]

# Each group is written on its own line followed by a blank line.
alike_problems = "".join(group + "\n\n" for group in duplicate_groups)

endtime = time.perf_counter()
print("耗时: %.3f秒" % (endtime - starttime))

# Make sure the output directory exists so a missing folder does not throw
# away the result of the scan.
output_dir = os.path.dirname(outputfile)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(outputfile, "w", encoding="u8") as f:
    f.write(alike_problems)