109 lines
3.5 KiB
Python
109 lines
3.5 KiB
Python
import os,re,difflib,Levenshtein,time,json
|
|
|
|
# IMPORTANT: range of existing problem IDs to compare against.
# Comma-separated blocks; "a:b" inside a block is a closed integer interval
# (the string is expanded by generate_number_set).
old_problems_range: str = "1:50000"

# Similarity threshold. NOTE(review): not referenced by the visible part of
# this script -- confirm whether it is still needed.
threshold: float = 0.85

# File containing the new problems to be checked against the problem bank.
filename: str = r"C:\Users\weiye\Documents\wwy sync\临时工作区\自拟题目14.tex"
|
|
|
|
# Generate the list of ID numbers: commas separate blocks, and ":" inside a block denotes a closed integer interval
|
|
def generate_number_set(string):
    """Expand a compact ID-range description into a list of zero-padded IDs.

    The description is a comma-separated list of blocks. A block is either a
    single number or ``start:end``, which denotes the closed integer interval
    from ``start`` to ``end``. All whitespace in the input is ignored.

    Args:
        string: Range description, e.g. ``"1:3, 7"``.

    Returns:
        A list of ID strings, each left-padded with zeros to at least six
        characters, in the order the blocks appear. Empty blocks contribute
        nothing; an interval whose start exceeds its end is empty.

    Raises:
        ValueError: If an interval block does not contain exactly one colon
            or its bounds are not integers.
    """
    compact = re.sub(r"\s", "", string)
    numbers_list = []
    for block in compact.split(","):
        if not block:
            # Empty blocks (empty input, doubled or trailing commas) would
            # otherwise be padded into the bogus ID "000000".
            continue
        if ":" not in block:
            numbers_list.append(block.zfill(6))
            continue
        start, end = block.split(":")
        numbers_list.extend(
            str(number).zfill(6) for number in range(int(start), int(end) + 1)
        )
    return numbers_list
|
|
|
|
# String preprocessing
|
|
def pre_treating(string):
    """Normalize problem text so that only its essential characters remain.

    A fixed sequence of regex deletions is applied, in order, to strip layout
    and markup before two problems are compared for similarity.

    Args:
        string: Raw problem text (LaTeX source).

    Returns:
        The text with every pattern below removed.
    """
    removal_patterns = (
        # Entire center environments, including their content.
        r"\\begin\{center\}[\s\S]*?\\end\{center\}",
        # Answer placeholders, choice macros and text-style macro names.
        r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)|(mathrm)|(text)",
        # Whitespace, backslashes, braces, dollar signs, parentheses, brackets.
        r"[\s\\\{\}\$\(\)\[\]]",
        # Newlines and tabs.
        r"[\n\t]",
        # Macro names whose backslash was already removed above.
        r"(displaystyle)|(overrightarrow)",
        # ASCII punctuation.
        r"[,\.:;?]",
    )
    cleaned = string
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned
|
|
|
|
# String comparison using difflib
|
|
def difflab_get_equal_rate(str1, str2):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of two strings.

    No junk-detection function is supplied (``isjunk`` is ``None``).

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The value of ``SequenceMatcher.ratio()`` for the pair.
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=str1, b=str2)
    return matcher.ratio()
|
|
|
|
# String comparison using Levenshtein.jaro
|
|
def jaro_get_equal_rate(str1, str2):
    """Return the similarity of two strings as computed by ``Levenshtein.jaro``.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        Whatever ``Levenshtein.jaro(str1, str2)`` returns.
    """
    similarity = Levenshtein.jaro(str1, str2)
    return similarity
|
|
|
|
# String comparison using Levenshtein.ratio
|
|
def Lev_get_equal_rate(str1, str2):
    """Return the similarity of two strings as computed by ``Levenshtein.ratio``.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        Whatever ``Levenshtein.ratio(str1, str2)`` returns.
    """
    similarity = Levenshtein.ratio(str1, str2)
    return similarity
|
|
|
|
def GenerateProblemListFromString(problem_string):
    """Split LaTeX source into a list of ``(problem_text, marker)`` pairs.

    Only the text between ``\\begin{document}`` and ``\\end{document}`` is
    considered when that pair is present; otherwise the whole input is used.
    Each ``\\item`` starts a problem, which runs until the next ``\\item`` or
    ``\\end{enumerate}``. An item followed by neither is not returned.

    The marker of a problem is the most recent line of the form
    ``%<letters/digits>`` that appears before the problem text, with the
    ``%`` removed, or ``""`` when no such line precedes it.

    Args:
        problem_string: LaTeX source text.

    Returns:
        A list of ``(problem_text, marker)`` tuples in document order, where
        ``problem_text`` is stripped of surrounding whitespace.
    """
    # Restrict to the document body when there is one.
    body = re.search(
        r"\\begin\{document\}([\s\S]*?)\\end\{document\}", problem_string
    )
    data = body.group(1) if body else problem_string

    data = re.sub(r"\n{2,}", "\n", data)
    # Give every item an explicit terminator so one regex can delimit items.
    data = re.sub(r"\\item", r"\\enditem\\item", data)
    data = re.sub(r"\\end\{enumerate\}", r"\\enditem", data)

    ProblemsList = []
    for item in re.finditer(r"\\item([\s\S]*?)\\enditem", data):
        raw = item.group(1)
        problem = raw.strip()
        # Use the position of this very match. Searching for the problem text
        # with data.index() finds its first occurrence, which is the wrong
        # place for duplicated problems or text contained in an earlier one.
        startpos = item.start(1) + (len(raw) - len(raw.lstrip()))
        markers = re.findall(r"\n\%[\dA-Za-z]+", data[:startpos])
        suffix = markers[-1].replace("%", "").strip() if markers else ""
        ProblemsList.append((problem, suffix))
    return ProblemsList
|
|
|
|
|
|
# Select the similarity function used for the comparison.
sim_test = jaro_get_equal_rate

# Load the existing problem bank.
with open(r"../题库0.3/Problems.json", "r", encoding="utf8") as f:
    database = f.read()
pro_dict = json.loads(database)

# Read the file with the new problems and split it into individual problems.
with open(filename, "r", encoding="u8") as f:
    newdatabase = f.read()
new_pro_list = GenerateProblemListFromString(newdatabase)

# Normalize the bank problems whose IDs fall inside the configured range.
# The expanded range can hold tens of thousands of IDs, so membership is
# tested against a set instead of scanning the list for every bank entry.
idrange_raw = generate_number_set(old_problems_range)
allowed_ids = set(idrange_raw)
idrange = [pid for pid in pro_dict if pid in allowed_ids]
pro_dict_treated = {
    pid: pre_treating(pro_dict[pid]["content"]) for pid in idrange
}

# Normalize the new problems, keyed by their 1-based position in the file.
new_dict_treated = {
    index: pre_treating(problem)
    for index, (problem, _marker) in enumerate(new_pro_list, start=1)
}

# For every new problem, find the most similar bank problem and report
# "<similarity>\t<new problem index>\t<bank problem id>".
report_lines = []
for i, new_p in new_dict_treated.items():
    maxsim = 0
    # Reset for each new problem: without this, a problem with no positive
    # similarity would report the previous problem's best match (or raise
    # NameError if it is the first one).
    argmax = ""
    for p, old_p in pro_dict_treated.items():
        sim = sim_test(new_p, old_p)
        if sim > maxsim:
            maxsim = sim
            argmax = p
    line = "%.3f\t%d\t%s" % (maxsim, i, argmax)
    print(line)
    report_lines.append(line + "\n")
output = "".join(report_lines)

# Save the report.
with open("临时文件/新题相似相同.txt", "w", encoding="u8") as f:
    f.write(output)
|
|
|