"""Helpers for a JSON problem database.

Provides loading/saving of the database, normalisation of problem text for
comparison, and similarity-ordered problem series built on
``Levenshtein.jaro``.
"""

import json
import os
import re

try:
    import Levenshtein
except ImportError:
    # Only the similarity helpers need this third-party package; keep the
    # I/O and preprocessing helpers importable when it is not installed.
    Levenshtein = None


# Substitutions applied by pre_treating, in this exact order.  The order
# matters: the LaTeX-aware patterns must run before backslashes and braces
# are stripped, and the keyword patterns run after whitespace is removed.
_PRE_TREATING_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"\\begin\{center\}[\s\S]*?\\end\{center\}",
        r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)",
        r"[\s\\\{\}\$\(\)\[\]]",
        r"(displaystyle)|(overrightarrow)|(overline)",
        r"[,\.:;?]",
    )
)


def load_dict(filename):
    """Read the JSON database stored in *filename* and return it as a dict.

    The file is decoded as UTF-8.  Errors from opening or parsing the file
    propagate to the caller.
    """
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)


def save_dict(adict, filename):
    """Serialise *adict* as indented JSON and write it to *filename*.

    Returns 0 on success and 1 on failure.
    """
    try:
        # Serialise before opening the file: opening in "w" mode truncates
        # it, so a serialisation error must not be able to wipe an existing
        # database.
        text = json.dumps(adict, indent=4, ensure_ascii=False)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(text)
    except Exception:
        # Keep the return-code contract for ordinary failures while letting
        # KeyboardInterrupt / SystemExit propagate.
        return 1
    return 0


def pre_treating(string):
    """Strip the characters of *string* that are irrelevant for comparison.

    Removes, in order: ``center`` environments, the ``bracket{N}`` /
    ``blank{N}`` / ``fourch`` / ``twoch`` / ``onech`` markers, whitespace and
    the characters ``\\ { } $ ( ) [ ]``, the words ``displaystyle`` /
    ``overrightarrow`` / ``overline``, and the punctuation ``, . : ; ?``.

    Returns the processed string.
    """
    for pattern in _PRE_TREATING_PATTERNS:
        string = pattern.sub("", string)
    return string


def treat_dict(p_dict):
    """Preprocess the content of every problem in the database dict.

    Returns a new dict keyed by the same ids, where each entry holds only the
    normalised ``"content"`` (see ``pre_treating``) and the original
    ``"same"`` field.
    """
    return {
        problem_id: {
            "content": pre_treating(entry["content"]),
            "same": entry["same"],
        }
        for problem_id, entry in p_dict.items()
    }


def _jaro(first, second):
    """Return ``Levenshtein.jaro(first, second)``, failing clearly if the package is missing."""
    if Levenshtein is None:
        raise ImportError(
            "the 'Levenshtein' package is required to compare problems"
        )
    return Levenshtein.jaro(first, second)


def detectmaxsim(currentid, excludelist, adict):
    """Find the problem most similar to *currentid* outside *excludelist*.

    Similarity is the ``Levenshtein.jaro`` score of the two ``"content"``
    strings.  Ties are resolved in favour of the id that comes first in
    *adict*.

    Returns ``(maxsim, argmaxsim)``: the best score and the id achieving it,
    or ``(-1, "000000")`` when every id in *adict* is excluded.
    """
    maxsim = -1
    argmaxsim = "000000"
    candidates = [pid for pid in adict if pid not in excludelist]
    if not candidates:
        return (maxsim, argmaxsim)
    target = adict[currentid]["content"]
    for pid in candidates:
        simrate = _jaro(adict[pid]["content"], target)
        if simrate > maxsim:
            maxsim = simrate
            argmaxsim = pid
    return (maxsim, argmaxsim)


def generate_problem_series(startingid, length, adict):
    """Build a chain of problem ids starting at *startingid*.

    Each following id is the not-yet-used problem most similar to the
    previous one.  At most *length* ids are appended after *startingid*; the
    chain ends early when *adict* has no unused problems left, so the
    ``"000000"`` placeholder of ``detectmaxsim`` never ends up in the result.

    Returns the ids in order, joined with commas.
    """
    series = [startingid]
    currentid = startingid
    for _ in range(length):
        maxsim, nextid = detectmaxsim(currentid, series, adict)
        if maxsim < 0:
            # No candidate was left to compare against.
            break
        series.append(nextid)
        currentid = nextid
    return ",".join(series)


if __name__ == "__main__":
    print("数据库工具, import用.")