"""Database helpers for the problem bank; meant to be imported, not run."""

import json
import os
import re

import fitz
import Levenshtein


# Substitutions applied, in this order, by pre_treating(). Order matters: the
# LaTeX environment and macro patterns rely on the backslashes/braces that the
# third pattern strips.
_NOISE_PATTERNS = (
    # Whole center environments.
    re.compile(r"\\begin\{center\}[\s\S]*?\\end\{center\}"),
    # Answer-slot and choice-layout macros.
    re.compile(r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)"),
    # All whitespace plus backslashes, braces, dollars, parentheses, brackets.
    re.compile(r"[\s\\\{\}\$\(\)\[\]]"),
    # Formatting command names left over once their backslash is gone.
    re.compile(r"(displaystyle)|(overrightarrow)|(overline)"),
    # ASCII punctuation.
    re.compile(r"[,\.:;?]"),
)


# --- JSON database load/save (not limited to the problem-ID database) -------

def load_dict(filename):
    """Read the JSON file *filename* (UTF-8) and return the decoded object.

    Raises whatever ``open`` or the JSON decoder raises (e.g. ``OSError``,
    ``json.JSONDecodeError``).
    """
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)


def save_dict(adict, filename) -> int:
    """Write *adict* to *filename* as indented, non-ASCII-escaped UTF-8 JSON.

    Returns 0 on success and 1 on failure. Failures are reported through the
    return value rather than raised; KeyboardInterrupt and SystemExit are not
    swallowed.
    """
    try:
        # Serialize before opening the file: opening with "w" truncates it,
        # so a serialization error must not be able to wipe existing data.
        payload = json.dumps(adict, indent=4, ensure_ascii=False)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(payload)
    except Exception:
        return 1
    return 0


# --- Text normalisation for similarity comparison ---------------------------

def pre_treating(string: str) -> str:
    """Return *string* with characters that are useless for comparison removed.

    Removes center environments, the bracket/blank/fourch/twoch/onech macros,
    all whitespace, backslashes, braces, dollars, parentheses, square brackets,
    the words displaystyle/overrightarrow/overline, and the punctuation
    ``, . : ; ?``.
    """
    for pattern in _NOISE_PATTERNS:
        string = pattern.sub("", string)
    return string


def treat_dict(p_dict):
    """Return a reduced copy of the problem dictionary, ready for comparison.

    Each entry of the result keeps only two fields: "content", passed through
    pre_treating(), and "same", copied unchanged. Raises ``KeyError`` if an
    entry lacks either field.
    """
    return {
        pid: {
            "content": pre_treating(entry["content"]),
            "same": entry["same"],
        }
        for pid, entry in p_dict.items()
    }


def detectmaxsim(currentid, excludelist, adict):
    """Find the entry of *adict* most similar to the entry *currentid*.

    Entries whose ID is in *excludelist* are skipped. Similarity is
    ``Levenshtein.jaro`` applied to the "content" fields.

    Returns a tuple ``(maxsim, argmaxsim)``: the highest similarity and the ID
    that achieved it (the first such ID on ties). When every ID is excluded
    the result is ``(-1, "000000")``.
    """
    candidates = [pid for pid in adict if pid not in excludelist]
    if not candidates:
        # Nothing left to compare against; adict[currentid] is deliberately
        # not touched in this case.
        return (-1, "000000")

    target = adict[currentid]["content"]
    maxsim = -1
    argmaxsim = "000000"
    for pid in candidates:
        simrate = Levenshtein.jaro(adict[pid]["content"], target)
        if simrate > maxsim:
            maxsim = simrate
            argmaxsim = pid
    return (maxsim, argmaxsim)


def generate_problem_series(startingid, length, adict):
    """Build a chain of IDs where each one is the most similar to the previous.

    Starts at *startingid* and appends up to *length* further IDs from
    *adict*, never repeating an ID. The chain ends early when *adict* has no
    unused ID left, instead of being padded with a placeholder ID.

    Returns the IDs, in order, joined with ",".
    """
    series = [startingid]
    currentid = startingid
    for _ in range(length):
        maxsim, nextid = detectmaxsim(currentid, series, adict)
        if maxsim < 0:
            # Negative similarity is detectmaxsim's "no candidate" marker.
            break
        series.append(nextid)
        currentid = nextid
    return ",".join(series)


# --- Problem-ID range strings -----------------------------------------------

def generate_number_set(string: str) -> list:
    """Expand an ID expression using "," and ":" into a list of 6-digit IDs.

    Example: ``"1:3,5"`` gives ``["000001", "000002", "000003", "000005"]``.
    Whitespace is ignored, and empty items (empty input, trailing or doubled
    commas) are skipped. Raises ``ValueError`` for a malformed range such as
    ``"1:2:3"`` or non-numeric range bounds.
    """
    numbers_list = []
    for piece in re.sub(r"\s", "", string).split(","):
        if not piece:
            continue
        if ":" in piece:
            start, end = piece.split(":")
            numbers_list.extend(
                str(number).zfill(6)
                for number in range(int(start), int(end) + 1)
            )
        else:
            numbers_list.append(piece.zfill(6))
    return numbers_list


def generate_exp(id_list) -> str:
    """Compress a list of ID strings into an expression using ":" and ",".

    Runs of consecutive numbers collapse to "first:last". Example:
    ``["000001", "000002", "000003", "000005"]`` gives
    ``"000001:000003,000005"``. An empty list gives ``"无有效题号"``.
    """
    if len(id_list) == 0:
        return "无有效题号"

    def _render(first, last):
        # A run of length one is written as the bare ID.
        return first if first == last else first + ":" + last

    ranges = []
    start = end = id_list[0]
    for pid in id_list[1:]:
        if int(pid) - 1 == int(end):
            end = pid  # extends the current run
        else:
            ranges.append(_render(start, end))
            start = end = pid
    ranges.append(_render(start, end))
    # Double quotes never appear in the result.
    return ",".join(ranges).replace('"', "")


# --- PDF text extraction ----------------------------------------------------

def parsePDF(filePath) -> str:
    """Return the text of every page of the PDF at *filePath*.

    Each page's text is followed by a newline.
    """
    with fitz.open(filePath) as doc:
        return "".join(page.get_text() + "\n" for page in doc.pages())


if __name__ == "__main__":
    print("数据库工具, import用.")