From b1a45c6518b9fc674c1f9f5174a3c6f4416b708a Mon Sep 17 00:00:00 2001 From: "weiye.wang" Date: Sat, 24 Jun 2023 16:37:08 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=B7=A5=E5=85=B7v2=E7=9B=AE?= =?UTF-8?q?=E5=BD=95,=20=E7=BB=A7=E7=BB=AD=E4=B8=B0=E5=AF=8Cdatabase=5Ftoo?= =?UTF-8?q?ls,=20=E6=96=B0=E5=8A=9F=E8=83=BD=E5=B7=B2=E7=A7=BB=E8=87=B3?= =?UTF-8?q?=E5=B7=A5=E5=85=B7v2=E7=9B=AE=E5=BD=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 工具/database_tools_obsolete.py | 255 +++++++++++++++++++++++++++++ {工具 => 工具v2}/database_tools.py | 51 +++++- 工具v2/寻找空闲题号.py | 5 + 工具v2/批量收录题目.py | 16 ++ 4 files changed, 325 insertions(+), 2 deletions(-) create mode 100644 工具/database_tools_obsolete.py rename {工具 => 工具v2}/database_tools.py (82%) create mode 100644 工具v2/寻找空闲题号.py create mode 100644 工具v2/批量收录题目.py diff --git a/工具/database_tools_obsolete.py b/工具/database_tools_obsolete.py new file mode 100644 index 00000000..fd4bdc5b --- /dev/null +++ b/工具/database_tools_obsolete.py @@ -0,0 +1,255 @@ +import json,re,os,Levenshtein,fitz,time + +def GetDate(): #获得当前日期 + currentdate = str(time.localtime().tm_year)+str(time.localtime().tm_mon).zfill(2)+str(time.localtime().tm_mday).zfill(2) + return currentdate #返回当前日期yyyymmdd + +#读取存储json数据库相关(不限于题号数据库) + +def load_dict(filename): #根据filename读取json数据库并转化为python字典 + with open(filename,"r",encoding = "u8") as f: + adict = json.loads(f.read()) + return adict #返回python字典 + +def save_dict(adict,filename): #将adict字典转化为json文件并保存至filename文件中 + try: + with open(filename,"w",encoding = "u8") as f: + f.write(json.dumps(adict,indent=4,ensure_ascii=False)) + return 0 #成功则返回0 + except: + return 1 #不成功则返回1 + + +def pre_treating(string): #删除字符串中对比较无用的字符, 以供比较 + string = re.sub(r"\\begin\{center\}[\s\S]*?\\end\{center\}","",string) + string = re.sub(r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)","",string) + string = re.sub(r"[\s\\\{\}\$\(\)\[\]]","",string) + string = re.sub(r"[\n\t]","",string) + string = re.sub(r"(displaystyle)|(overrightarrow)|(overline)","",string) + string = re.sub(r"[,\.:;?]","",string) + return string #返回处理后的字符串 + +def treat_dict(p_dict): #对整个题库字典中的内容部分进行预处理,删除无用字符 + treated_dict = {} + for id in p_dict: + treated_dict[id] = {} + treated_dict[id]["content"] = pre_treating(p_dict[id]["content"]) + treated_dict[id]["same"] = p_dict[id]["same"] + return treated_dict #返回处理后的字典, 含内容字段及相同题目字段 + +def detectmaxsim(currentid,excludelist,adict): #检测与已知题目关联程度最大的题目(除外列表之外的部分) + maxsim = -1 + argmaxsim = "000000" + for id in adict: + if not id in excludelist: + simrate = Levenshtein.jaro(adict[id]["content"],adict[currentid]["content"]) + if simrate > maxsim: + maxsim = simrate + argmaxsim = id + return (maxsim,argmaxsim) #返回最大关联系数与关联程度最大的题号 + +def generate_problem_series(startingid,length,adict): #在adict字典里返回从startingid开始的一系列题号, 每一题都是与上一题的关联程度最大的 + excludelist = [startingid] + currentid = startingid + for i in range(length): + maxsim,currentid = detectmaxsim(currentid,excludelist,adict) + excludelist.append(currentid) + return ",".join(excludelist) #返回按顺序的题号列表 + + +def generate_number_set(string,*thedict): #根据可能含有":"和","的题号字符串生成一个用逗号分隔的六位题号列表, 例如"1:3,5"会生成["000001","000002","000003","000005"] +#可变参数*dict如果存在, 将只生成dict的keys中包含的题号列表 + string = re.sub(r"[\n\s]","",string) + string_list = string.split(",") + numbers_list = [] + for s in string_list: + if not ":" in s: + numbers_list.append(s.zfill(6)) + else: + start,end = s.split(":") + for ind in range(int(start),int(end)+1): + numbers_list.append(str(ind).zfill(6)) + if len(thedict) == 0: + return numbers_list #返回六位题号列表 + elif len(thedict) == 1 and type(thedict[0]) == dict: + numbers_list = [id for id in numbers_list if id in thedict[0]] + return numbers_list #返回字典中存在的六位题号列表 + else: + return "输入参数有误" + +def generate_exp(id_list): #根据题号列表生成字符串式的含":"和","的题号字符串, 例如["000001","000002","000003","000005"]生成"000001:000003,000005", 若列表为空则生成"无有效题号" + if not len(id_list) == 0: + exp_list = [] + start = id_list[0] + current = start + end = start + for id in id_list[1:]: + # print(id,current) + if int(id)-1 == int(current): + current = id + end = id + else: + if not start == end: + exp_list.append('"'+start+":"+end+'"') + else: + exp_list.append('"'+start+'"') + start = id + current = id + end = id + if not start == end: + exp_list.append('"'+start+":"+end+'"') + else: + exp_list.append('"'+start+'"') + exp_str = ",".join(exp_list).replace('"',"") + else: + exp_str = "无有效题号" + return exp_str #返回含有":"或","的题号字符串 + +def parsePDF(filePath): #提取pdf文件中的字符 + with fitz.open(filePath) as doc: + text = "" + for page in doc.pages(): + text += page.get_text() + "\n" + return text + +def extractIDs(filePath): #提取.txt,.tex或.pdf文件中的题号, 返回含有":"或","的题号字符串 + if filePath[-4:] == ".txt" or filePath[-4:] == ".tex": + with open(filePath,"r",encoding = "u8") as f: + data = f.read() + elif filePath[-4:] == ".pdf": + data = parsePDF(filePath) + else: + return "格式不正确" + ids = re.findall(r"\((\d{6})\)",data) + return generate_exp(ids) + + +def spareIDs(dictname): #返回空闲题号 + idlist = list(dictname.keys()) + used_str = generate_exp(idlist) + used_list = used_str.split(",") + output = "" + for group in range(len(used_list)-1): + output += "首个空闲id: %s, 直至: %s"%(str(int(used_list[group][-6:])+1).zfill(6),str(int(used_list[group+1][:6])-1).zfill(6)) + "\n" + output += "首个空闲id: %s, 直至: %s"%(str(int(used_list[-1][-6:])+1).zfill(6),"999999") + return output #返回的是一个多行的字符串, 每一行中含有一个空闲题号的闭区间 + +def parse_usage(datastring): #对单个usages中的项的结果进行分词 + datastring = re.sub(r"\s+","\t",datastring.strip()) + datalist = datastring.split("\t") + date = "" + classname = "" + diff = [] + for item in datalist: + if not "." in item and not "高" in item and not "班" in item: + date = item + elif "高" in item or "班" in item: + classname = item + else: + diff.append(item) + return({"date":date,"classname":classname,"difficulty":diff}) #返回一个字典, "date"表示日期, "classname"表示班级, "difficultiy"表示难度列表 + + +def GenerateProblemListFromString(data): #从来自.tex文件的字符串生成题目列表, 每个item是一道题目, 新一行的%用作前缀 + try: + data = re.findall(r"\\begin\{document\}([\s\S]*?)\\end\{document\}",data)[0] + except: + pass + data = re.sub(r"\n{2,}","\n",data) + data = re.sub(r"\\item",r"\\enditem\\item",data) + data = re.sub(r"\\end\{enumerate\}",r"\\enditem",data) #切除无关信息, 保留关键信息 + problempositions = [] + for item in re.finditer(r"\\item([\s\S]*?)\\enditem",data): + problempositions.append(item.regs[1]) #确定题目内容所在位置 + problem_list = [] + for pos in problempositions: + content = data[pos[0]:pos[1]].strip() + content = re.sub(r"\n\%[\s\S]*$","",content) #题目内容 + subdata = data[:pos[0]] #开始寻找出处中缀 + suflist = re.findall(r"\n(\%\s{0,}[\S]+)\n",subdata) + if len(suflist) == 0: + suffix = "" + else: + suffix = suflist[-1].replace("%","").strip() + problem_list.append((content,suffix)) + return problem_list #返回一个列表, 每一项是一个由 题目内容 和 题目来源前缀 组成的元组 + + +def CreateEmptyProblem(problem): # 根据已有的题目创建新的空题目 + NewProblem = problem.copy() + for field in NewProblem: + if type(NewProblem[field]) == str: + NewProblem[field] = "" + elif type(NewProblem[field]) == list: + NewProblem[field] = [] + elif type(NewProblem[field]) == int or type(NewProblem[field]) == float: + NewProblem[field] = -1 + return NewProblem #返回一个空题目的字典, ID和内容待赋值 + +# 创建新题目 +def CreateNewProblem(id,content,origin,dict,editor): # 构建一道新题目的字典 + NewProblem = CreateEmptyProblem(dict["000001"]) + NewProblem["id"] = str(id).zfill(6) + NewProblem["content"] = content + NewProblem["origin"] = origin + NewProblem["edit"] = [editor] + return NewProblem # 返回一道新题目的字典, 已赋新的ID, 内容, 来源和编辑者 + +def CreateIDLinks(old_id_list,new_id_list,*thedict): #建立已有id和新id之间的联系, thedict为可选, 选中的话即为当前字典, 会从new_id_list中排除当前字典中有的项 + if len(thedict) == 1 and type(thedict[0]) == dict: + new_id_list = [id for id in new_id_list if not id in thedict[0]] + if len(old_id_list)>len(new_id_list): + return "新ID个数不足." + else: + id_links = [] + for i in range(len(old_id_list)): + id_links.append((old_id_list[i],new_id_list[i])) + return id_links # 返回id联系, 每个元组表示一对id, 前者是旧id, 后者是新id + + +def CreateRelatedProblems(links,thedict,filepath): # 根据links关联生成待编辑的新题目字典, 等待编辑修改 + try: + new_dict = {} + for item in links: + old_id,new_id = item + new_dict[old_id] = thedict[old_id].copy() + new_dict[old_id]["id"] = new_id + "待替换" + new_dict[old_id]["content"] = "(待编辑)" + new_dict[old_id]["content"] + new_dict[old_id]["usages"] = [] + new_dict[old_id]["same"] = [] + new_dict[old_id]["unrelated"] = [] + new_dict[old_id]["edit"] = new_dict[old_id]["edit"].copy() + [GetDate()+"\t"] + new_dict[old_id]["origin"] += "-" + GetDate() + "修改" + save_dict(new_dict,filepath) + except: + return 1 #异常返回1 + return 0 #正常返回0 + +def ImportRelatedProblems(new_json,main_json): # 导入编辑过的关联题目json文件到主数据库 + pro_dict = load_dict(main_json) + new_dict = load_dict(new_json) + for id in new_dict: + new_id = new_dict[id]["id"].replace("待替换","") #新题号后需要跟"待替换"字样 + if new_id in pro_dict: + print("题号有重复") + return 1 + else: + pro_dict[new_id] = new_dict[id].copy() + pro_dict[new_id]["id"] = new_id + pro_dict[id]["related"] += [new_id] + pro_dict[new_id]["related"] += [id] + print("导入关联题目 %s -> %s 信息成功."%(id,new_id)) + save_dict(dict(sorted(pro_dict.items())),main_json) #保存至目标pro_dict文件 + return 0 #正常返回0 + + +def strip_suffix(originalString, suf_words_list): # 字符串去除指定后缀 + for sw in suf_words_list: + output = re.sub(sw+r"[\S]*$","",originalString) + return(output) # 返回原字符串中截去suf_words_list及之后字符的部分 + +def get_striped_origin(pro_dict,id,suf_words_list): # 题目来源去除指定后缀 + return strip_suffix(pro_dict[id]["origin"],suf_words_list) # 返回去除指定后缀后的题目来源 + +if __name__ == "__main__": + print("数据库工具, import用.") \ No newline at end of file diff --git a/工具/database_tools.py b/工具v2/database_tools.py similarity index 82% rename from 工具/database_tools.py rename to 工具v2/database_tools.py index 9ee6076f..17b35818 100644 --- a/工具/database_tools.py +++ b/工具v2/database_tools.py @@ -4,6 +4,15 @@ def GetDate(): #获得当前日期 currentdate = str(time.localtime().tm_year)+str(time.localtime().tm_mon).zfill(2)+str(time.localtime().tm_mday).zfill(2) return currentdate #返回当前日期yyyymmdd +def ReadTextFile(filepath): #读取文本格式的文件 + with open(filepath,"r",encoding="u8") as f: + data = f.read() + return data #返回文本格式文件的内容 + +def SortDict(adict): #按字典项顺序排序字典 + return dict(sorted(adict.items())) #返回排序后的字典 + + #读取存储json数据库相关(不限于题号数据库) def load_dict(filename): #根据filename读取json数据库并转化为python字典 @@ -186,7 +195,6 @@ def CreateEmptyProblem(problem): # 根据已有的题目创建新的空题目 NewProblem[field] = -1 return NewProblem #返回一个空题目的字典, ID和内容待赋值 -# 创建新题目 def CreateNewProblem(id,content,origin,dict,editor): # 构建一道新题目的字典 NewProblem = CreateEmptyProblem(dict["000001"]) NewProblem["id"] = str(id).zfill(6) @@ -195,6 +203,37 @@ def CreateNewProblem(id,content,origin,dict,editor): # 构建一道新题目的 NewProblem["edit"] = [editor] return NewProblem # 返回一道新题目的字典, 已赋新的ID, 内容, 来源和编辑者 + +def AddProblemstoDict(startingid,raworigin,problems,editor,indexdescription,thedict): #将来自GenerateProblemListFromString的列表中的题目添加到thedict字典 + id = int(startingid) + currentsuffix = problems[0][1] + problemindex = 0 + for p_and_suffix in problems: + p, suffix = p_and_suffix + pid = str(id).zfill(6) + if pid in thedict: + print("ID %s 已被使用."%pid) + return 1 + else: + if suffix == currentsuffix: + problemindex += 1 + else: + problemindex = 1 + origin = raworigin + suffix + indexdescription.strip() + ("" if indexdescription.strip() == "" else str(problemindex)) + newproblem = CreateNewProblem(pid,p.strip(),origin,thedict,GetDate() + "\t" + editor) + if "blank" in p: + newproblem["genre"] = "填空题" + elif "bracket" in p: + newproblem["genre"] = "选择题" + else: + newproblem["genre"] = "解答题" + thedict[pid] = newproblem + maxsim,argmaxsim = detectmaxsim(pid,[pid],thedict) + print("已收录题号: %s, 最接近题目: %s, 相似程度: %.3f, 题目类型: %s, 题目内容: %s"%(pid,argmaxsim,maxsim,newproblem["genre"],p)) + id += 1 + return 0 + + def CreateIDLinks(old_id_list,new_id_list,*thedict): #建立已有id和新id之间的联系, thedict为可选, 选中的话即为当前字典, 会从new_id_list中排除当前字典中有的项 if len(thedict) == 1 and type(thedict[0]) == dict: new_id_list = [id for id in new_id_list if not id in thedict[0]] @@ -239,9 +278,17 @@ def ImportRelatedProblems(new_json,main_json): # 导入编辑过的关联题目j pro_dict[id]["related"] += [new_id] pro_dict[new_id]["related"] += [id] print("导入关联题目 %s -> %s 信息成功."%(id,new_id)) - save_dict(dict(sorted(pro_dict.items())),main_json) #保存至目标pro_dict文件 + save_dict(SortDict(pro_dict),main_json) #保存至目标pro_dict文件 return 0 #正常返回0 +def strip_suffix(originalString, suf_words_list): # 字符串去除指定后缀 + for sw in suf_words_list: + output = re.sub(sw+r"[\S]*$","",originalString) + return(output) # 返回原字符串中截去suf_words_list及之后字符的部分 + +def get_striped_origin(pro_dict,id,suf_words_list): # 题目来源去除指定后缀 + return strip_suffix(pro_dict[id]["origin"],suf_words_list) # 返回去除指定后缀后的题目来源 + if __name__ == "__main__": print("数据库工具, import用.") \ No newline at end of file diff --git a/工具v2/寻找空闲题号.py b/工具v2/寻找空闲题号.py new file mode 100644 index 00000000..846a9c99 --- /dev/null +++ b/工具v2/寻找空闲题号.py @@ -0,0 +1,5 @@ +from database_tools import * + +pro_dict = load_dict(r"..\题库0.3\Problems.json") +print(spareIDs(pro_dict)) + diff --git a/工具v2/批量收录题目.py b/工具v2/批量收录题目.py new file mode 100644 index 00000000..8114b0e6 --- /dev/null +++ b/工具v2/批量收录题目.py @@ -0,0 +1,16 @@ +#修改起始id,出处,文件名 +starting_id = 18237 #起始id设置, 来自"寻找空闲题号"功能 +raworigin = "测试一下" #题目来源的前缀(中缀在.tex文件中) +filename = r"C:\Users\weiye\Documents\wwy sync\临时工作区\自拟题目16.tex" #题目的来源.tex文件 +editor = "王伟叶" #编辑者姓名 +IndexDescription = " " #设置是否使用后缀, 留空("")则不用后缀, 不留空则以所设字符串作为后缀起始词, 按.tex文件中的顺序编号 + + +from database_tools import * + +problems = GenerateProblemListFromString(ReadTextFile(filename)) +pro_dict = load_dict(r"../题库0.3/Problems.json") +AddProblemstoDict(starting_id,raworigin,problems,editor,IndexDescription,pro_dict) +save_dict(SortDict(pro_dict),r"../题库0.3/Problems.json") + +