# This repository has been archived on 2024-06-23. You can view files and clone it, but cannot push or open issues or pull requests.
# mathdeptv2/工具/database_tools.py
#
# 105 lines
# 4.3 KiB
# Python

import json,re,os,Levenshtein,fitz
#读取存储json数据库相关(不限于题号数据库)
def load_dict(filename):
    """Load the JSON database stored in ``filename``.

    The file is opened as UTF-8 text and parsed as JSON.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The parsed JSON value (a dict when the file holds a JSON object).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(filename, "r", encoding="u8") as handle:
        return json.load(handle)
def save_dict(adict, filename):
    """Write ``adict`` to ``filename`` as pretty-printed UTF-8 JSON.

    The data is serialized with a 4-space indent and without ASCII escaping,
    so non-ASCII text is stored verbatim.

    Args:
        adict: JSON-serializable object to store.
        filename: Path of the file to (over)write.

    Returns:
        0 if the file was written, 1 if serialization or writing failed.
    """
    try:
        # Serialize before opening the file: mode "w" truncates on open, so a
        # serialization error must not be able to wipe an existing database.
        payload = json.dumps(adict, indent=4, ensure_ascii=False)
        with open(filename, "w", encoding="u8") as f:
            f.write(payload)
    except Exception:
        # Best-effort contract: any ordinary failure is reported through the
        # return code. KeyboardInterrupt / SystemExit are deliberately not
        # caught so the program can still be interrupted.
        return 1
    return 0
def pre_treating(string):
    """Strip the parts of a problem text that should not affect comparison.

    The removal passes run strictly in the order listed below. The order
    matters: the first two passes rely on backslashes and braces that the
    third pass deletes, and the keyword pass runs after whitespace has been
    removed.

    Args:
        string: Raw problem text (LaTeX source).

    Returns:
        The text with every match of every pass removed.
    """
    removal_passes = (
        # Whole center environments.
        r"\\begin\{center\}[\s\S]*?\\end\{center\}",
        # Answer placeholders and choice-layout macro names.
        r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)",
        # Whitespace, backslashes, braces, dollar signs, parentheses, brackets.
        r"[\s\\\{\}\$\(\)\[\]]",
        # Newlines and tabs.
        r"[\n\t]",
        # Formatting command names.
        r"(displaystyle)|(overrightarrow)|(overline)",
        # ASCII punctuation.
        r"[,\.:;?]",
    )
    cleaned = string
    for pattern in removal_passes:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned
def treat_dict(p_dict):
    """Build a comparison-ready copy of a problem database.

    Args:
        p_dict: Mapping from problem id to an entry that has at least the
            keys "content" and "same".

    Returns:
        A new dict with the same ids. Each entry holds only "content"
        (passed through ``pre_treating``) and "same" (carried over as-is,
        not copied).
    """
    return {
        problem_id: {
            "content": pre_treating(entry["content"]),
            "same": entry["same"],
        }
        for problem_id, entry in p_dict.items()
    }
def detectmaxsim(currentid, excludelist, adict):
    """Find the problem most similar to ``currentid`` outside ``excludelist``.

    Similarity is the value returned by ``Levenshtein.jaro`` for the two
    "content" strings. On ties the id encountered first in ``adict`` wins.

    Args:
        currentid: Id of the reference problem; looked up in ``adict`` only
            when at least one candidate is examined.
        excludelist: Ids that must not be returned.
        adict: Mapping from problem id to an entry with a "content" string.

    Returns:
        A tuple ``(similarity, problem_id)``. If every id is excluded (or
        ``adict`` is empty) the result is ``(-1, "000000")``.
    """
    best_score = -1
    best_id = "000000"
    for candidate in adict:
        if candidate in excludelist:
            continue
        score = Levenshtein.jaro(
            adict[candidate]["content"], adict[currentid]["content"]
        )
        if score > best_score:
            best_score, best_id = score, candidate
    return (best_score, best_id)
def generate_problem_series(startingid, length, adict):
    """Chain problems greedily by similarity, starting at ``startingid``.

    Each following id is the problem most similar to the previous one (as
    chosen by ``detectmaxsim``) among those not yet in the series.

    Args:
        startingid: Id that opens the series.
        length: Maximum number of ids to append after ``startingid``.
        adict: Mapping from problem id to an entry with a "content" string.

    Returns:
        The ids of the series, in order, joined with ",". The series is
        shorter than ``length + 1`` ids when ``adict`` runs out of unused
        problems.
    """
    series = [startingid]
    currentid = startingid
    for _ in range(length):
        maxsim, nextid = detectmaxsim(currentid, series, adict)
        if maxsim < 0:
            # detectmaxsim reports a negative similarity only when no
            # candidate is left; stop instead of padding the series with
            # its placeholder id.
            break
        series.append(nextid)
        currentid = nextid
    return ",".join(series)
def generate_number_set(string):
    """Expand an id expression into a list of six-digit problem ids.

    The expression is a comma-separated list whose items are either a single
    id or an inclusive range ``start:end``. All whitespace is ignored.
    For example ``"1:3,5"`` gives
    ``["000001", "000002", "000003", "000005"]``.

    Args:
        string: The id expression.

    Returns:
        The ids in the order written, each left-padded with zeros to six
        characters. Empty items (empty input, trailing or doubled commas)
        contribute nothing; a range whose end is below its start is empty.

    Raises:
        ValueError: If a range item has more than one ":" or a non-integer
            bound.
    """
    compact = re.sub(r"\s", "", string)
    numbers_list = []
    for token in compact.split(","):
        if not token:
            # Skip empty items so they do not turn into the id "000000".
            continue
        if ":" in token:
            start, end = token.split(":")
            numbers_list.extend(
                str(number).zfill(6) for number in range(int(start), int(end) + 1)
            )
        else:
            numbers_list.append(token.zfill(6))
    return numbers_list
def generate_exp(id_list):
    """Compress a list of problem ids into a ":" / "," id expression.

    Consecutive ids (each numerically one more than the previous) are merged
    into ``first:last``; everything else is listed individually. For example
    ``["000001", "000002", "000003", "000005"]`` gives
    ``"000001:000003,000005"``.

    Args:
        id_list: Sequence of id strings, in the order they should appear.

    Returns:
        The id expression, or the string "无有效题号" when ``id_list`` is
        empty.
    """
    if len(id_list) == 0:
        return "无有效题号"

    # Collect maximal runs of consecutive ids as (first, last) pairs.
    runs = []
    run_start = run_end = id_list[0]
    for problem_id in id_list[1:]:
        if int(problem_id) - 1 == int(run_end):
            run_end = problem_id
        else:
            runs.append((run_start, run_end))
            run_start = run_end = problem_id
    runs.append((run_start, run_end))

    pieces = [
        first if first == last else first + ":" + last
        for first, last in runs
    ]
    # Double quotes never appear in the result: any present in an id are dropped.
    return ",".join(pieces).replace('"', "")
def parsePDF(filePath):
    """Extract the text of a PDF file.

    Args:
        filePath: Path of the PDF, passed to ``fitz.open``.

    Returns:
        The text of all pages in page order, with a newline appended after
        each page's text.
    """
    with fitz.open(filePath) as doc:
        return "".join(page.get_text() + "\n" for page in doc.pages())
if __name__ == "__main__":
    # Run directly, the module only prints a one-line notice.
    print("数据库工具, import用.")