import json
import os
import re
import time

import Levenshtein  # python-Levenshtein: string similarity (Jaro)
import fitz  # PyMuPDF: PDF text extraction

def GetDate():  # get the current date
    t = time.localtime()
    currentdate = str(t.tm_year) + str(t.tm_mon).zfill(2) + str(t.tm_mday).zfill(2)
    return currentdate  # current date as a yyyymmdd string

def ReadTextFile(filepath):  # read a plain-text file
    with open(filepath, "r", encoding="u8") as f:
        data = f.read()
    return data  # content of the text file

def SortDict(adict):  # sort a dict by key
    return dict(sorted(adict.items()))  # sorted copy of the dict

# Loading and saving JSON databases (not limited to the problem-id database)

def load_dict(filename):  # read the JSON database at filename and convert it to a Python dict
    with open(filename, "r", encoding="u8") as f:
        adict = json.loads(f.read())
    return adict  # the database as a Python dict

def save_dict(adict, filename):  # serialize the dict adict as JSON and save it to filename
    try:
        with open(filename, "w", encoding="u8") as f:
            f.write(json.dumps(adict, indent=4, ensure_ascii=False))
        return 0  # 0 on success
    except Exception:
        return 1  # 1 on failure

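# Usage sketch for the JSON helpers above. "problems.json" is a hypothetical file name,
# not one mandated by this module:
#     pro_dict = load_dict("problems.json")
#     pro_dict = SortDict(pro_dict)
#     if save_dict(pro_dict, "problems.json") != 0:
#         print("save failed")
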
def pre_treating(string):  # strip characters that are useless for comparison, so strings can be compared
    string = re.sub(r"\\begin\{center\}[\s\S]*?\\end\{center\}", "", string)
    string = re.sub(r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)", "", string)
    string = re.sub(r"[\s\\\{\}\$\(\)\[\]]", "", string)
    string = re.sub(r"[\n\t]", "", string)
    string = re.sub(r"(displaystyle)|(overrightarrow)|(overline)", "", string)
    string = re.sub(r"[,\.:;?]", "", string)
    return string  # the cleaned string

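# Quick illustration of pre_treating on a made-up LaTeX snippet (not a database entry):
#     pre_treating(r"$\displaystyle x+1$, \blank{2}")  ->  "x+1"
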
def treat_dict(p_dict):  # pre-treat the content of every problem in the database, removing useless characters
    treated_dict = {}
    for id in p_dict:
        treated_dict[id] = {}
        treated_dict[id]["content"] = pre_treating(p_dict[id]["content"])
        treated_dict[id]["same"] = p_dict[id]["same"]
    return treated_dict  # treated dict keeping the "content" and "same" fields

def detectmaxsim(currentid, excludelist, adict):  # find the problem most similar to a known one (excluding ids in excludelist)
    maxsim = -1
    argmaxsim = "000000"
    for id in adict:
        if id not in excludelist:
            simrate = Levenshtein.jaro(adict[id]["content"], adict[currentid]["content"])
            if simrate > maxsim:
                maxsim = simrate
                argmaxsim = id
    return (maxsim, argmaxsim)  # highest similarity and the id of the most similar problem

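# Sketch: find the closest match for one problem in a pre-treated database.
# "problems.json" and the id "000123" are illustrative only:
#     treated = treat_dict(load_dict("problems.json"))
#     maxsim, closest_id = detectmaxsim("000123", ["000123"], treated)
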
def generate_problem_series(startingid, length, adict):  # starting from startingid, build a series of ids in adict where each problem is the one most similar to the previous
    excludelist = [startingid]
    currentid = startingid
    for i in range(length):
        maxsim, currentid = detectmaxsim(currentid, excludelist, adict)
        excludelist.append(currentid)
    return ",".join(excludelist)  # comma-separated ids in series order

def generate_number_set(string, *thedict):
    # Expand an id expression that may contain ":" and "," into a list of 6-digit ids,
    # e.g. "1:3,5" -> ["000001", "000002", "000003", "000005"].
    # If the optional *thedict argument is present, only ids found in its keys are kept.
    string = re.sub(r"[\n\s]", "", string)
    string_list = string.split(",")
    numbers_list = []
    for s in string_list:
        if ":" not in s:
            numbers_list.append(s.zfill(6))
        else:
            start, end = s.split(":")
            for ind in range(int(start), int(end) + 1):
                numbers_list.append(str(ind).zfill(6))
    if len(thedict) == 0:
        return numbers_list  # list of 6-digit ids
    elif len(thedict) == 1 and type(thedict[0]) == dict:
        numbers_list = [id for id in numbers_list if id in thedict[0]]
        return numbers_list  # list of 6-digit ids that exist in the dict
    else:
        return "输入参数有误"  # invalid arguments

def generate_exp(id_list):
    # Collapse a list of 6-digit ids into an expression using ":" and ",",
    # e.g. ["000001", "000002", "000003", "000005"] -> "000001:000003,000005".
    # Returns "无有效题号" if the list is empty.
    if len(id_list) == 0:
        return "无有效题号"
    exp_list = []
    start = id_list[0]
    current = start
    end = start
    for id in id_list[1:]:
        if int(id) - 1 == int(current):  # id extends the current consecutive run
            current = id
            end = id
        else:  # run broken: flush the finished run and start a new one
            exp_list.append(start + ":" + end if start != end else start)
            start = id
            current = id
            end = id
    exp_list.append(start + ":" + end if start != end else start)  # flush the last run
    return ",".join(exp_list)  # id expression containing ":" and/or ","

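# The two helpers above convert in opposite directions, e.g.:
#     generate_number_set("1:3,5")                            ->  ["000001", "000002", "000003", "000005"]
#     generate_exp(["000001", "000002", "000003", "000005"])  ->  "000001:000003,000005"
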
def parsePDF(filePath):  # extract the text of a PDF file
    with fitz.open(filePath) as doc:
        text = ""
        for page in doc.pages():
            text += page.get_text() + "\n"
    return text  # extracted text, one chunk per page

def extractIDs(filePath):  # extract the problem ids in a .txt, .tex or .pdf file and return them as an id expression with ":" and ","
    if filePath[-4:] == ".txt" or filePath[-4:] == ".tex":
        with open(filePath, "r", encoding="u8") as f:
            data = f.read()
    elif filePath[-4:] == ".pdf":
        data = parsePDF(filePath)
    else:
        return "格式不正确"  # unsupported file format
    ids = re.findall(r"\((\d{6})\)", data)
    return generate_exp(ids)

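# Sketch: pull the ids out of an exported worksheet. "worksheet.pdf" is a hypothetical
# file whose problems carry labels such as "(000123)":
#     print(extractIDs("worksheet.pdf"))
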
def spareIDs(dictname):  # report the spare (unused) ids of the database
    idlist = list(dictname.keys())
    used_str = generate_exp(idlist)
    used_list = used_str.split(",")
    output = ""
    for group in range(len(used_list) - 1):
        output += "首个空闲id: %s, 直至: %s" % (str(int(used_list[group][-6:]) + 1).zfill(6), str(int(used_list[group + 1][:6]) - 1).zfill(6)) + "\n"
    output += "首个空闲id: %s, 直至: %s" % (str(int(used_list[-1][-6:]) + 1).zfill(6), "999999")
    return output  # multi-line string, each line a closed interval of spare ids

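# Sketch: print the unused id ranges of the current database ("problems.json" is hypothetical):
#     print(spareIDs(load_dict("problems.json")))
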
def parse_usage(datastring):  # tokenize a single entry of the "usages" field
    datastring = re.sub(r"\s+", "\t", datastring.strip())
    datalist = datastring.split("\t")
    date = ""
    classname = ""
    diff = []
    for item in datalist:
        if "." not in item and "高" not in item and "班" not in item:
            date = item
        elif "高" in item or "班" in item:
            classname = item
        else:
            diff.append(item)
    return {"date": date, "classname": classname, "difficulty": diff}  # "date" is the date, "classname" the class, "difficulty" the list of difficulty values

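# Example on a made-up usage entry (date, class name, difficulty values):
#     parse_usage("20240101  高一1班  0.85  0.60")
#     ->  {"date": "20240101", "classname": "高一1班", "difficulty": ["0.85", "0.60"]}
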
def GenerateProblemListFromString(data):
    # Build a problem list from the contents of a .tex file; each item is one problem.
    # A "%" at the start of a line is used as an origin prefix for the problems that follow it.
    try:
        data = re.findall(r"\\begin\{document\}([\s\S]*?)\\end\{document\}", data)[0]
    except IndexError:
        pass
    data = re.sub(r"\n{2,}", "\n", data)
    data = re.sub(r"\\item", r"\\enditem\\item", data)
    data = re.sub(r"\\end\{enumerate\}", r"\\enditem", data)  # cut away irrelevant text, keep the key parts
    problempositions = []
    for item in re.finditer(r"\\item([\s\S]*?)\\enditem", data):
        problempositions.append(item.regs[1])  # locate each problem's content
    problem_list = []
    for pos in problempositions:
        content = data[pos[0]:pos[1]].strip()
        content = re.sub(r"\n\%[\s\S]*$", "", content)  # the problem content
        subdata = data[:pos[0]]  # search backwards for the origin prefix
        suflist = re.findall(r"\n(\%\s{0,}[\S]+)\n", subdata)
        if len(suflist) == 0:
            suffix = ""
        else:
            suffix = suflist[-1].replace("%", "").strip()
        problem_list.append((content, suffix))
    return problem_list  # list of (problem content, origin prefix) tuples

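# Minimal sketch of the expected .tex structure (illustrative content only):
#     tex = ("\\begin{document}\n\\begin{enumerate}\n"
#            "% 2024模拟卷\n"
#            "\\item 第一题内容\n"
#            "\\item 第二题内容\n"
#            "\\end{enumerate}\n\\end{document}\n")
#     GenerateProblemListFromString(tex)
#     ->  [("第一题内容", "2024模拟卷"), ("第二题内容", "2024模拟卷")]
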
def CreateEmptyProblem(problem):  # create a new, empty problem from an existing one
    NewProblem = problem.copy()
    for field in NewProblem:
        if type(NewProblem[field]) == str:
            NewProblem[field] = ""
        elif type(NewProblem[field]) == list:
            NewProblem[field] = []
        elif type(NewProblem[field]) == int or type(NewProblem[field]) == float:
            NewProblem[field] = -1
    return NewProblem  # empty problem dict; id and content still to be assigned

def CreateNewProblem(id, content, origin, dict, editor):  # build the dict of a new problem
    NewProblem = CreateEmptyProblem(dict["000001"])
    NewProblem["id"] = str(id).zfill(6)
    NewProblem["content"] = content
    NewProblem["origin"] = origin
    NewProblem["edit"] = [editor]
    return NewProblem  # new problem dict with id, content, origin and editor assigned

def AddProblemstoDict(startingid, raworigin, problems, editor, indexdescription, thedict):
    # Add the problems produced by GenerateProblemListFromString to thedict, starting at startingid.
    id = int(startingid)
    currentsuffix = problems[0][1]
    problemindex = 0
    for p_and_suffix in problems:
        p, suffix = p_and_suffix
        pid = str(id).zfill(6)
        if pid in thedict:
            print("ID %s 已被使用." % pid)  # id already in use
            return 1
        else:
            if suffix == currentsuffix:
                problemindex += 1
            else:
                problemindex = 1
                currentsuffix = suffix  # switch to the new source group and restart its numbering
            origin = raworigin + suffix + indexdescription.strip() + ("" if indexdescription.strip() == "" else str(problemindex))
            newproblem = CreateNewProblem(pid, p.strip(), origin, thedict, GetDate() + "\t" + editor)
            if "blank" in p:
                newproblem["genre"] = "填空题"
            elif "bracket" in p:
                newproblem["genre"] = "选择题"
            else:
                newproblem["genre"] = "解答题"
            thedict[pid] = newproblem
            maxsim, argmaxsim = detectmaxsim(pid, [pid], thedict)
            print("已收录题号: %s, 最接近题目: %s, 相似程度: %.3f, 题目类型: %s, 题目内容: %s" % (pid, argmaxsim, maxsim, newproblem["genre"], p))
        id += 1
    return 0

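# End-to-end sketch for importing a .tex worksheet. All file names, ids and the editor
# name below are hypothetical:
#     pro_dict = load_dict("problems.json")
#     problems = GenerateProblemListFromString(ReadTextFile("new_problems.tex"))
#     if AddProblemstoDict("000301", "2024-", problems, "editor", "第", pro_dict) == 0:
#         save_dict(SortDict(pro_dict), "problems.json")
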
def CreateIDLinks(old_id_list, new_id_list, *thedict):
    # Pair existing ids with new ids. The optional thedict argument is the current database;
    # if given, ids already present in it are removed from new_id_list first.
    if len(thedict) == 1 and type(thedict[0]) == dict:
        new_id_list = [id for id in new_id_list if id not in thedict[0]]
    if len(old_id_list) > len(new_id_list):
        return "新ID个数不足."  # not enough new ids
    else:
        id_links = []
        for i in range(len(old_id_list)):
            id_links.append((old_id_list[i], new_id_list[i]))
        return id_links  # list of (old id, new id) tuples

def CreateRelatedProblems(links, thedict, filepath):  # from the id links, build a dict of new problems awaiting editing and save it to filepath
    try:
        new_dict = {}
        for item in links:
            old_id, new_id = item
            new_dict[old_id] = thedict[old_id].copy()
            new_dict[old_id]["id"] = new_id + "待替换"
            new_dict[old_id]["content"] = "(待编辑)" + new_dict[old_id]["content"]
            new_dict[old_id]["usages"] = []
            new_dict[old_id]["same"] = []
            new_dict[old_id]["unrelated"] = []
            new_dict[old_id]["edit"] = new_dict[old_id]["edit"].copy() + [GetDate() + "\t"]
            new_dict[old_id]["origin"] += "-" + GetDate() + "修改"
        save_dict(new_dict, filepath)
    except Exception:
        return 1  # 1 on failure
    return 0  # 0 on success

def ImportRelatedProblems(new_json, main_json):  # import an edited related-problems JSON file into the main database
    pro_dict = load_dict(main_json)
    new_dict = load_dict(new_json)
    for id in new_dict:
        new_id = new_dict[id]["id"].replace("待替换", "")  # the new id must carry the trailing "待替换" marker
        if new_id in pro_dict:
            print("题号有重复")  # duplicate id
            return 1
        else:
            pro_dict[new_id] = new_dict[id].copy()
            pro_dict[new_id]["id"] = new_id
            pro_dict[id]["related"] += [new_id]
            pro_dict[new_id]["related"] += [id]
            print("导入关联题目 %s -> %s 信息成功." % (id, new_id))
    save_dict(SortDict(pro_dict), main_json)  # save back to the main database file
    return 0  # 0 on success

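# Workflow sketch for related problems (file names and ids are hypothetical):
#     pro_dict = load_dict("problems.json")
#     links = CreateIDLinks(["000010", "000011"], ["000301", "000302"], pro_dict)
#     CreateRelatedProblems(links, pro_dict, "related_to_edit.json")
#     # ... edit related_to_edit.json by hand, then merge it back:
#     ImportRelatedProblems("related_to_edit.json", "problems.json")
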
def strip_suffix(originalString, suf_words_list):  # strip the given suffix words from a string
    output = originalString
    for sw in suf_words_list:
        output = re.sub(sw + r"[\S]*$", "", output)
    return output  # the string with each suffix word and the characters after it cut off

def get_striped_origin(pro_dict, id, suf_words_list):  # problem origin with the given suffix words stripped
    return strip_suffix(pro_dict[id]["origin"], suf_words_list)  # origin with the suffix words and what follows removed

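# Example (illustrative origin string):
#     strip_suffix("2024模拟卷第3题", ["第"])  ->  "2024模拟卷"
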
if __name__ == "__main__":
    print("数据库工具, import用.")