733 lines
35 KiB
Python
733 lines
35 KiB
Python
import json,re,os,Levenshtein,fitz,time,sys
|
||
|
||
def GetDate():
    """Return today's date as a zero-padded yyyymmdd string."""
    now = time.localtime()
    return "%04d%02d%02d" % (now.tm_year, now.tm_mon, now.tm_mday)
|
||
|
||
def ReadTextFile(filepath):
    """Return the full contents of the UTF-8 text file at *filepath*."""
    with open(filepath, "r", encoding="u8") as fh:
        return fh.read()
|
||
|
||
def SaveTextFile(data, filepath):
    """Write *data* to *filepath* as UTF-8 text and return the path."""
    with open(filepath, "w", encoding="u8") as fh:
        fh.write(data)
    return filepath
|
||
|
||
def SortDict(adict):
    """Return a new dict with the items of *adict* ordered by key."""
    return {key: adict[key] for key in sorted(adict)}
|
||
|
||
|
||
# JSON database load/save helpers (not limited to the problem-id database)
|
||
|
||
def load_dict(filename):
    """Load the JSON database at *filename* and return it as a Python dict."""
    with open(filename, "r", encoding="u8") as fh:
        return json.load(fh)
|
||
|
||
def save_dict(adict, filename):
    """Serialize *adict* as pretty-printed JSON into *filename*.

    ensure_ascii=False keeps Chinese text human-readable in the file.
    Returns 0 on success, 1 on any failure (unwritable path, value that
    json cannot serialize, ...).
    """
    try:
        with open(filename, "w", encoding="u8") as fh:
            json.dump(adict, fh, indent=4, ensure_ascii=False)
        return 0
    except Exception:
        # was a bare `except:`; Exception no longer swallows Ctrl-C/SystemExit
        return 1
|
||
|
||
|
||
def pre_treating(string):
    """Strip characters that carry no information for similarity comparison.

    Applied in order: centered environments, template macros, whitespace and
    LaTeX punctuation, newlines/tabs, common math commands, sentence
    punctuation. Returns the stripped string.
    """
    patterns = (
        r"\\begin\{center\}[\s\S]*?\\end\{center\}",
        r"(bracket\{\d+\})|(blank\{\d+\})|(fourch)|(twoch)|(onech)",
        r"[\s\\\{\}\$\(\)\[\]]",
        r"[\n\t]",
        r"(displaystyle)|(overrightarrow)|(overline)",
        r"[,\.:;?]",
    )
    for pattern in patterns:
        string = re.sub(pattern, "", string)
    return string
|
||
|
||
def treat_dict(p_dict):
    """Pre-process every problem's content for comparison.

    Returns a new dict keyed by problem id with a stripped "content" field
    and the original "same" links carried over.
    """
    return {
        pid: {"content": pre_treating(entry["content"]), "same": entry["same"]}
        for pid, entry in p_dict.items()
    }
|
||
|
||
def detectmaxsim(currentid, excludelist, adict):
    """Find the problem most similar to *currentid*, skipping *excludelist*.

    Similarity is the Jaro distance between the pre-processed contents.
    Returns (max similarity, id of most similar problem); when no candidate
    qualifies the defaults (-1, "000000") are returned.
    """
    best_rate = -1
    best_id = "000000"
    reference = adict[currentid]["content"]
    for pid in adict:
        if pid in excludelist:
            continue
        rate = Levenshtein.jaro(adict[pid]["content"], reference)
        if rate > best_rate:
            best_rate = rate
            best_id = pid
    return (best_rate, best_id)
|
||
|
||
def generate_problem_series(startingid, length, adict):
    """Chain *length* problems starting from *startingid*.

    Each next problem is the one most similar to the previous one that is
    not already in the chain. Returns the ids joined with commas, in order.
    """
    chain = [startingid]
    cursor = startingid
    for _ in range(length):
        _, cursor = detectmaxsim(cursor, chain, adict)
        chain.append(cursor)
    return ",".join(chain)
|
||
|
||
|
||
def generate_number_set(string, *thedict):
    """Expand an id expression like "1:3,5" into 6-digit zero-padded ids.

    "a:b" tokens expand to the inclusive range; plain tokens are padded
    as-is. With one dict vararg, only ids present in that dict are kept.
    Any other vararg usage returns an error string.
    """
    string = re.sub(r"[\n\s]", "", string)
    numbers_list = []
    for token in string.split(","):
        if ":" in token:
            lo, hi = token.split(":")
            numbers_list.extend(str(n).zfill(6) for n in range(int(lo), int(hi) + 1))
        else:
            numbers_list.append(token.zfill(6))
    if len(thedict) == 0:
        return numbers_list
    if len(thedict) == 1 and type(thedict[0]) == dict:
        return [pid for pid in numbers_list if pid in thedict[0]]
    return "输入参数有误"
|
||
|
||
def generate_exp(id_list):
    """Compress a list of 6-digit ids into a ":"/"," expression.

    Consecutive ids collapse into "start:end"; isolated ids stay alone,
    e.g. ["000001","000002","000003","000005"] -> "000001:000003,000005".
    An empty list yields "无有效题号".
    """
    if not id_list:
        return "无有效题号"

    def close_run(lo, hi):
        # one run becomes either "lo:hi" or just "lo" when it has one element
        return lo if lo == hi else lo + ":" + hi

    runs = []
    run_start = run_end = id_list[0]
    for pid in id_list[1:]:
        if int(pid) == int(run_end) + 1:
            run_end = pid
        else:
            runs.append(close_run(run_start, run_end))
            run_start = run_end = pid
    runs.append(close_run(run_start, run_end))
    return ",".join(runs)
|
||
|
||
def parsePDF(filePath):
    """Extract the plain text of every page of a PDF, one page per chunk,
    each followed by a newline."""
    with fitz.open(filePath) as doc:
        return "".join(page.get_text() + "\n" for page in doc.pages())
|
||
|
||
def extractIDs(filePath):
    """Collect every "(dddddd)" problem id from a .txt/.tex/.pdf file.

    Returns the ids as a compact ":"/"," expression, or "格式不正确" for
    unsupported extensions.
    """
    if filePath.endswith((".txt", ".tex")):
        with open(filePath, "r", encoding="u8") as fh:
            data = fh.read()
    elif filePath.endswith(".pdf"):
        data = parsePDF(filePath)
    else:
        return "格式不正确"
    return generate_exp(re.findall(r"\((\d{6})\)", data))
|
||
|
||
|
||
def spareIDs(dictname):
    """Describe the free id ranges between the ids used in *dictname*.

    Returns a multi-line string; each line names one closed interval of
    unused ids, the last interval extending to 999999.
    """
    groups = generate_exp(list(dictname.keys())).split(",")
    lines = []
    for left, right in zip(groups, groups[1:]):
        # the gap starts right after the end of one group and stops right
        # before the start of the next
        lines.append("首个空闲id: %s, 直至: %s" % (
            str(int(left[-6:]) + 1).zfill(6),
            str(int(right[:6]) - 1).zfill(6)))
    lines.append("首个空闲id: %s, 直至: %s" % (str(int(groups[-1][-6:]) + 1).zfill(6), "999999"))
    return "\n".join(lines)
|
||
|
||
def parse_usage(datastring):
    """Tokenize one usage record into its parts.

    A token containing 高 or 班 is the class name; a token without a dot
    (and no class marker) is the date; everything else is a difficulty.
    Returns {"date": ..., "classname": ..., "difficulty": [...]}.
    """
    tokens = re.sub(r"\s+", "\t", datastring.strip()).split("\t")
    result = {"date": "", "classname": "", "difficulty": []}
    for token in tokens:
        if "高" in token or "班" in token:
            result["classname"] = token
        elif "." not in token:
            result["date"] = token
        else:
            result["difficulty"].append(token)
    return result
|
||
|
||
|
||
def GenerateProblemListFromString(data): # build a problem list from a .tex file string
    """Split a .tex document into (content, origin-suffix) tuples.

    Each ``\\item`` becomes one problem. A line of the form ``%...`` standing
    alone before an item is taken as that item's origin suffix (the last such
    line wins). Returns a list of tuples.
    """
    try:
        # keep only the document body when a full preamble is present
        data = re.findall(r"\\begin\{document\}([\s\S]*?)\\end\{document\}",data)[0]
    except:
        pass
    data = re.sub(r"\n{2,}","\n",data)
    # insert an explicit end marker before every \item and at \end{enumerate}
    # so each item's body can be captured non-greedily below
    data = re.sub(r"\\item",r"\\enditem\\item",data)
    data = re.sub(r"\\end\{enumerate\}",r"\\enditem",data) # cut irrelevant info, keep the essentials
    problempositions = []
    for item in re.finditer(r"\\item([\s\S]*?)\\enditem",data):
        problempositions.append(item.regs[1]) # (start, end) span of each item's body
    problem_list = []
    for pos in problempositions:
        content = data[pos[0]:pos[1]].strip()
        content = re.sub(r"\n\%[\s\S]*$","",content) # drop a trailing %-comment block from the content
        subdata = data[:pos[0]] # search backwards from the item for the origin infix
        suflist = re.findall(r"\n(\%\s{0,}[\S]+)\n",subdata)
        if len(suflist) == 0:
            suffix = ""
        else:
            suffix = suflist[-1].replace("%","").strip()
        problem_list.append((content,suffix))
    return problem_list # list of (problem content, origin suffix) tuples
|
||
|
||
|
||
def CreateEmptyProblem(problem):
    """Clone *problem* and blank every field according to its type.

    Strings become "", lists become fresh [], ints/floats become -1; fields
    of any other type are carried over unchanged. The caller fills in the
    id and content afterwards.
    """
    blank = problem.copy()
    for key in blank:
        value = blank[key]
        if type(value) == str:
            blank[key] = ""
        elif type(value) == list:
            blank[key] = []
        elif type(value) in (int, float):
            blank[key] = -1
    return blank
|
||
|
||
def CreateNewProblem(id, content, origin, dict, editor):
    """Build a fresh problem dict modeled on entry "000001" of *dict*.

    The new problem gets a zero-padded id, the given content and origin,
    and an edit history containing only *editor*.
    """
    problem = CreateEmptyProblem(dict["000001"])
    problem["id"] = str(id).zfill(6)
    problem["content"] = content
    problem["origin"] = origin
    problem["edit"] = [editor]
    return problem
|
||
|
||
|
||
def AddProblemstoDict(startingid,raworigin,problems,editor,indexdescription,thedict): # add problems from GenerateProblemListFromString into thedict
    """Insert *problems* (list of (content, origin-suffix) tuples) into
    *thedict* under consecutive 6-digit ids starting at *startingid*.

    The origin is raworigin + suffix + indexdescription + running index,
    and genre/space are inferred from "blank"/"bracket" markers in the
    content. Prints a per-problem report including the most similar existing
    problem. Returns 1 and stops on an id collision, otherwise 0.
    """
    id = int(startingid)
    currentsuffix = problems[0][1]
    problemindex = 0
    for p_and_suffix in problems:
        p, suffix = p_and_suffix
        pid = str(id).zfill(6)
        if pid in thedict:
            print("ID %s 已被使用."%pid)
            return 1
        else:
            # running index within one origin suffix
            # NOTE(review): currentsuffix is never updated after a suffix
            # change, so once the suffix differs the index resets to 1 on
            # every subsequent problem — confirm whether that is intended.
            if suffix == currentsuffix:
                problemindex += 1
            else:
                problemindex = 1
            origin = raworigin + suffix + indexdescription.strip() + ("" if indexdescription.strip() == "" else str(problemindex))
            newproblem = CreateNewProblem(pid,p.strip(),origin,thedict,GetDate() + "\t" + editor)
            # genre/space derived from the template markers in the content
            if "blank" in p:
                newproblem["genre"] = "填空题"
                newproblem["space"] = ""
            elif "bracket" in p:
                newproblem["genre"] = "选择题"
                newproblem["space"] = ""
            else:
                newproblem["genre"] = "解答题"
                newproblem["space"] = "4em"
            thedict[pid] = newproblem
            # report the closest existing problem as a duplicate hint
            maxsim,argmaxsim = detectmaxsim(pid,[pid],thedict)
            print("已收录题号: %s, 最接近题目: %s, 相似程度: %.3f, 题目类型: %s, 题目来源: %s, 题目内容: %s"%(pid,argmaxsim,maxsim,newproblem["genre"],origin,p))
            id += 1
    return 0
|
||
|
||
|
||
def CreateIDLinks(old_id_list, new_id_list, *thedict):
    """Pair old ids with new ids one-to-one.

    If a dict vararg is supplied, new ids already present in it are removed
    first. Returns a list of (old_id, new_id) tuples, or the error string
    "新ID个数不足." when there are not enough new ids.
    """
    if len(thedict) == 1 and type(thedict[0]) == dict:
        new_id_list = [pid for pid in new_id_list if pid not in thedict[0]]
    if len(old_id_list) > len(new_id_list):
        return "新ID个数不足."
    # zip truncates to len(old_id_list), matching the original index loop
    return list(zip(old_id_list, new_id_list))
|
||
|
||
|
||
def CreateRelatedProblems(links, thedict, filepath, editor):
    """Build an editable dict of new problems derived from existing ones.

    For each (old_id, new_id) in *links*: copy the old problem, mark its id
    ("new_id待替换") and content ("(待编辑)...") as pending edits, clear the
    usage/same/unrelated fields, stamp the edit history with date+editor and
    the origin with "-date修改", then save everything to *filepath*.
    Returns 0 on success, 1 on any error (unknown id, unwritable path, ...).
    """
    try:
        new_dict = {}
        for old_id, new_id in links:
            entry = thedict[old_id].copy()
            entry["id"] = new_id + "待替换"  # marker stripped again on import
            entry["content"] = "(待编辑)" + entry["content"]
            entry["usages"] = []
            entry["same"] = []
            entry["unrelated"] = []
            entry["edit"] = entry["edit"].copy() + [GetDate() + "\t" + editor]
            entry["origin"] += "-" + GetDate() + "修改"
            new_dict[old_id] = entry
        save_dict(new_dict, filepath)
    except Exception:
        # was a bare `except:`; Exception no longer swallows Ctrl-C/SystemExit
        return 1
    return 0
|
||
|
||
def ImportRelatedProblems(new_json, main_json):
    """Merge the edited related-problem file *new_json* into *main_json*.

    Each entry's "id" must be the new id followed by "待替换". The old and
    new problems are cross-linked through their "related" fields, and
    genre/space are re-derived from the content markers (same rules as
    AddProblemstoDict). The sorted database is saved back to *main_json*.
    Returns 0 on success; returns 1 without saving if a new id collides.

    Bug fix: each genre branch used to assign ``genre`` twice, so the second
    assignment ("", "" or "4em") clobbered the genre just set; the second
    assignment now targets the ``space`` field, matching AddProblemstoDict.
    """
    pro_dict = load_dict(main_json)
    new_dict = load_dict(new_json)
    for id in new_dict:
        new_id = new_dict[id]["id"].replace("待替换","")  # new id must carry the "待替换" marker
        if new_id in pro_dict:
            print("题号有重复")
            return 1
        else:
            pro_dict[new_id] = new_dict[id].copy()
            pro_dict[new_id]["id"] = new_id
            # mutual "related" links between the old and the new problem
            pro_dict[id]["related"] += [new_id]
            pro_dict[new_id]["related"] += [id]
            p = pro_dict[new_id]["content"]
            if "blank" in p:
                pro_dict[new_id]["genre"] = "填空题"
                pro_dict[new_id]["space"] = ""
            elif "bracket" in p:
                pro_dict[new_id]["genre"] = "选择题"
                pro_dict[new_id]["space"] = ""
            else:
                pro_dict[new_id]["genre"] = "解答题"
                pro_dict[new_id]["space"] = "4em"
            print("导入关联题目 %s -> %s 信息成功."%(id,new_id))
    save_dict(SortDict(pro_dict), main_json)  # persist into the main database
    return 0
|
||
|
||
|
||
def strip_suffix(originalString, suf_words_list):
    """Cut each suffix word (and any non-whitespace tail after it) off the
    end of *originalString*, applying the words in order.

    Bug fix: the previous version re-ran every substitution on the ORIGINAL
    string, so only the last word of *suf_words_list* had any effect, and an
    empty list left the result variable undefined (NameError).
    """
    output = originalString
    for sw in suf_words_list:
        output = re.sub(sw + r"[\S]*$", "", output)
    return output
|
||
|
||
def get_striped_origin(pro_dict, id, suf_words_list):
    """Return problem *id*'s origin with the given suffix words stripped."""
    origin = pro_dict[id]["origin"]
    return strip_suffix(origin, suf_words_list)
|
||
|
||
|
||
def SeperateFirstLine(string):
    """Split *string* at newlines into (first line, remaining lines).

    Every line is stripped of surrounding spaces; the remaining lines are
    re-joined with newlines.
    """
    first, *rest = string.split("\n")
    return (first.strip(), "\n".join(part.strip() for part in rest))
|
||
|
||
def ObtainDatatoModify(metadatafilepath,fields_dict): # collect the edits described in a metadata file
    """Parse a metadata file into (field, id, content) edit requests.

    The file separates entries with blank lines. A single-line entry whose
    text is a key of *fields_dict* switches the current field; an entry of
    two or more lines is a problem id on the first line followed by the new
    content. A one-line entry that is NOT a field name resets the context.
    Returns a list of (field, 6-digit id, content) triples.
    """
    # fields_dict is the field-info database (field name -> metadata)
    data = ReadTextFile(metadatafilepath)
    datalines = data.split("\n")
    data = ""
    for l in datalines:
        if l.strip() in fields_dict:
            # force a field-name line into its own blank-line-delimited entry
            data += l + "\n\n"
        else:
            data += l + "\n"
    data = re.sub(r"\n[\s\n]*\n","\n\n",data.strip()) # collapse stray blanks and extra newlines
    datalist = data.split("\n\n")
    to_modify_list = []
    currentfield = "NotAField"
    for line in datalist:
        if line.strip() in fields_dict:
            currentfield = line.strip()
        elif not "\n" in line:
            # one-line entry that is not a field name: invalidate the context
            currentfield = "NotAField"
        else:
            id,content = SeperateFirstLine(line)
            to_modify_list.append((currentfield,str(id).zfill(6),content))
    return to_modify_list # list of (field, 6-digit id, new content) triples
|
||
|
||
def FloatToInt(string):
    """Parse *string* as a number; snap to int when within 0.01 of one."""
    value = float(string)
    nearest = round(value)
    return nearest if abs(value - nearest) < 0.01 else value
|
||
|
||
def OverwriteData(prodict, fieldsdict, field_id_and_content):
    """Overwrite one field of one problem with new content.

    "int"/"float" fields are converted through FloatToInt first; "str"
    fields are assigned as-is. Returns the (id, field, content) triple.
    """
    field, id, content = field_id_and_content
    fieldType = fieldsdict[field]["FieldType"]
    if id not in prodict:
        print("题号 %s 并不在数据库中" % id)
    else:
        if fieldType == "str":
            prodict[id][field] = content
        elif fieldType in ("int", "float"):
            content = FloatToInt(content)
            prodict[id][field] = content
        # NB: reports success for any known id, even field types not written
        print("已覆盖 %s 的 %s 字段, 内容为 %s" % (id, field, content))
    return (id, field, content)
|
||
|
||
def AppendData(prodict, fieldsdict, field_id_and_content):
    """Append *content* to a problem field, skipping duplicates.

    "str" fields get the content appended on a new line (unless the stripped
    content is already a substring); "list" fields get one element per input
    line (unless already present). Returns (id, field, content).
    """
    field, id, content = field_id_and_content
    fieldType = fieldsdict[field]["FieldType"]
    if id not in prodict:
        print("题号 %s 并不在数据库中" % id)
    elif fieldType == "str":
        if content.strip() in prodict[id][field]:
            print("题号 %s 的 %s 字段, 内容 %s 已存在" % (id, field, content))
        else:
            prodict[id][field] = (prodict[id][field].strip() + "\n" + content).strip()
            print("已于 %s 的 %s 字段执行添加, 内容为 %s" % (id, field, content))
    elif fieldType == "list":
        for line in (part.strip() for part in content.split("\n")):
            if line in prodict[id][field]:
                print("题号 %s 的 %s 字段, 内容 %s 已存在" % (id, field, line))
            else:
                prodict[id][field] = prodict[id][field].copy() + [line]
                print("已于 %s 的 %s 字段执行添加, 内容为 %s" % (id, field, line))
    return (id, field, content)
|
||
|
||
def AppendMutualData(prodict, field_id_and_content):
    """Create symmetric links (same/related/unrelated) between *id* and each
    id listed in *content* (one per line, zero-padded to 6 digits).

    Unknown ids are reported and skipped; existing links are not duplicated.
    Returns (id, field, content).
    """
    field, id, content = field_id_and_content
    for other in [str(line).zfill(6) for line in content.split("\n")]:
        if id not in prodict:
            print("题号 %s 并不在数据库中" % id)
        elif other not in prodict:
            print("题号 %s 并不在数据库中" % other)
        else:
            # add the link in both directions, id -> other first
            for a, b in ((id, other), (other, id)):
                if b in prodict[a][field]:
                    print("题号 %s 的 %s 字段, 内容 %s 已存在" % (a, field, b))
                else:
                    prodict[a][field] = prodict[a][field] + [b]
                    print("已于 %s 的 %s 字段执行添加, 内容为 %s" % (a, field, b))
    return (id, field, content)
|
||
|
||
def AppendObjData(prodict, objdict, field_id_and_content):
    """Append target ids (keys of *objdict*) to a problem field.

    One id per input line; ids missing from *objdict* and duplicates are
    reported and skipped. Returns (id, field, content).
    """
    field, id, content = field_id_and_content
    if id not in prodict:
        print("题号 %s 并不在数据库中" % id)
    else:
        for objid in (line.strip() for line in content.split("\n")):
            if objid not in objdict:
                print("目标编号 %s 并不在数据库中" % objid)
            elif objid in prodict[id][field]:
                print("题号 %s 的 %s 字段, 内容 %s 已存在" % (id, field, objid))
            else:
                prodict[id][field] = prodict[id][field] + [objid]
                print("已于 %s 的 %s 字段执行添加, 编号为 %s, 内容为 %s" % (id, field, objid, objdict[objid]["content"]))
    return (id, field, content)
|
||
|
||
def AppendUsageData(prodict,field_id_and_content): # append usage-record data
    """Append usage lines to prodict[id][field] with conflict handling.

    Normal lines are skipped when an identical record (ignoring the leading
    date) already exists; if the same class already has a record, the line
    goes to a pending list instead of being added. Lines containing "FORCE"
    are uppercased, stripped of the marker, and appended unconditionally.
    Returns (field, id, content, output) where *output* is a reviewable
    metadata string listing each pending new line (tagged FORCE) next to the
    old records it conflicts with.
    """
    field,id,content = field_id_and_content
    lines = [re.sub(r"\s+",r"\t",line.strip()) for line in content.split("\n")]
    pending_list = []
    for line in lines:
        if not "FORCE" in line.upper():
            time_stripped = re.sub(r"^\d{4,}\t","",line) # the record with its leading date removed
            if time_stripped in "\n".join(prodict[id][field]):
                print("题号 %s 的 %s 字段, 内容 %s 已存在"%(id,field,line))
            else:
                # the class-id token is the one containing 班/高/一/二/三
                classid = [info for info in line.split("\t") if len(re.findall(r"[班高一二三]",info))>0][0]
                if classid in "\n".join(prodict[id][field]):
                    # same class already recorded: defer to the pending list
                    print("班级 %s 在题号为 %s 的题目处已有使用记录, 在pending list中记录"%(classid,id))
                    oldinfo = [usage for usage in prodict[id][field] if classid in usage]
                    pending_list = pending_list + [(id,line,oldinfo)]
                else:
                    prodict[id][field].append(line)
                    print("已于 %s 的 %s 字段执行添加, 内容为 %s"%(id,field,line))
        else:
            # FORCE marker: strip it and append without any duplicate checks
            line = re.sub(r"\s+",r"\t",re.sub(r"FORCE","",line.upper())).strip()
            prodict[id][field].append(line)
            print("已于 %s 的 %s 字段执行强制添加, 内容为 %s"%(id,field,line))
    # assemble the pending-review string: new line (FORCE-tagged) + old lines
    output = "usages\n\n"
    for item in pending_list:
        id,new,oldlist = item
        output += id + "\n"
        output += new+"\tFORCE\n"
        output += "\n".join(oldlist)+"\n\n"
    return (field,id,content,output) # note the order: (field, id, content, pending string)
|
||
|
||
def ImportMetadata(prodict,objdict,fieldsdict,metadatafilepath,pendingdatafilepath): # apply metadata edits, dispatching on each field's Method
    """Apply every edit described in the metadata file to *prodict*.

    Each (field, id, content) triple from ObtainDatatoModify is dispatched
    to the writer named by fieldsdict[field]["Method"]. "usageappend" also
    saves the pending-conflict string to *pendingdatafilepath*. Returns 1 on
    a bad field name, otherwise the feedback of the LAST processed item.
    The caller is responsible for saving *prodict* afterwards.
    """
    data_to_modify = ObtainDatatoModify(metadatafilepath,fieldsdict)
    for item in data_to_modify:
        field = item[0]
        if field == "NotAField":
            print("字段名有误")
            return 1
        else:
            method = fieldsdict[field]["Method"]
            if method == "overwrite":
                feedback = OverwriteData(prodict,fieldsdict,item)
            elif method == "append":
                feedback = AppendData(prodict,fieldsdict,item)
            elif method == "mutualappend":
                feedback = AppendMutualData(prodict,item)
            elif method == "objappend":
                feedback = AppendObjData(prodict,objdict,item)
            elif method == "usageappend":
                feedback = AppendUsageData(prodict,item)
                outputstring = feedback[3]
                # persist the conflicting usage records for manual review
                SaveTextFile(outputstring,pendingdatafilepath)
            elif method == "fixed":
                print("字段 %s 不可按此方式修改"%field)
    # NOTE(review): if the edit list is empty or only contains "fixed"
    # fields, `feedback` is unbound here and this raises NameError — confirm.
    return feedback # feedback of the last modification; 1 means a bad field name
|
||
|
||
def parseUsage(usagestring):
    """Tokenize one usage line into its structured parts.

    A leading token matching 20xx... with no CJK characters is the date; a
    following token containing 高/一/二/三/班 is the class id; everything
    left is parsed as per-sub-problem score rates. Returns a dict with keys
    date, classid, subproblems, difficulties and glossdiff (rounded mean).
    """
    tokens = re.sub(r"\s+", r"\t", usagestring.strip()).split("\t")
    out = {}
    looks_like_date = (re.findall(r"20[\d]{2,6}", tokens[0]) != []
                       and re.findall(r"[\u4e00-\u9fa5]", tokens[0]) == [])
    out["date"] = tokens.pop(0) if looks_like_date else ""
    has_class_marker = re.findall(r"[高一二三班]", tokens[0]) != []
    out["classid"] = tokens.pop(0) if has_class_marker else ""
    out["subproblems"] = len(tokens)
    out["difficulties"] = [float(t) for t in tokens]
    out["glossdiff"] = round(sum(out["difficulties"]) / out["subproblems"], 3)
    return out
|
||
|
||
def StringSubstitute(regex, template, stringtuple):
    """Replace each *regex* match in *template* with the corresponding entry
    of *stringtuple*, positionally.

    Replacement uses str.replace, so identical match texts are all replaced
    at once. Prints "长度不符." and returns 1 when the match count differs
    from len(stringtuple); otherwise returns the substituted string.
    """
    matches = re.findall(regex, template)
    if len(matches) != len(stringtuple):
        print("长度不符.")
        return 1
    output = template
    for found, replacement in zip(matches, stringtuple):
        output = output.replace(found, replacement)
    return output
|
||
|
||
def GenerateTexDataforEditing(id_string, prodict, templatefilepath, editor):
    """Render the problems selected by *id_string* into the editing template.

    Each problem contributes its content, answer, solution and remark as an
    enumerate item; the assembled body replaces the "<<...待替换...>>"
    placeholder of the template. Returns the .tex string to be saved.
    """
    body = "编辑者: " + editor + "\n\n\\begin{enumerate}\n\n"
    for pid in generate_number_set(id_string, prodict):
        entry = prodict[pid]
        body += ("\\item (%s) " % str(pid).zfill(6) + entry["content"] + "\n\n"
                 + "答案: " + entry["ans"] + "\n\n"
                 + "解答与提示: " + entry["solution"] + "\n\n"
                 + "备注: " + entry["remark"] + "\n\n")
    body += "\\end{enumerate}"
    template = ReadTextFile(templatefilepath)
    return StringSubstitute(r"<<[\s\S]*?待替换[\s\S]*?>>", template, [body])
|
||
|
||
|
||
def GetEditedProblems(string):
    """Extract the enumerate body of an edited .tex string and split it into
    a dict keyed by 6-digit problem id.

    Each value is the (content, ans, solution, remark) tuple produced by
    parseProblem. A sentinel "\\item" is appended so the last item is also
    captured by the non-greedy regex.
    """
    body = re.findall(r"\\begin\{enumerate\}([\s\S]*)\\end\{enumerate\}", string)[0] + r"\item"
    return {
        pid: parseProblem(text)
        for pid, text in re.findall(r"\((\d{6})\)([\s\S]*?)\\item", body)
    }
|
||
|
||
def parseProblem(string):
    """Split a four-paragraph problem string into its labeled parts.

    Paragraphs are separated by blank lines and expected in the order:
    content, "答案:...", "解答与提示:...", "备注:...". Returns the stripped
    (content, ans, solution, remark) tuple with the labels removed.
    """
    normalized = re.sub(r"\n{2,}", "\n\n", string.strip())
    content, ans, solution, remark = normalized.split("\n\n")
    return (
        content.strip(),
        re.sub("答案:", "", ans).strip(),
        re.sub("解答与提示:", "", solution).strip(),
        re.sub("备注:", "", remark).strip(),
    )
|
||
|
||
def ModifyProblembyTeX(id_string,prodict,toeditfilepath,editor): # edit problems' content/answer/solution via a .tex file opened in VS Code
    """Round-trip edit: export problems to .tex, let the user edit them in
    VS Code, then merge the changes back into *prodict*.

    Blocks until the editor window is closed (``code -w``). Changed problems
    get their edit history stamped with date+editor; the whole database is
    then saved. Returns the edited ids as a ":"/"," expression.
    """
    # read the id list and generate the .tex file to edit
    texdata = GenerateTexDataforEditing(id_string,prodict,"模板文件/题目编辑.txt",editor)
    SaveTextFile(texdata,toeditfilepath)
    # open the .tex file and wait for the user to finish
    print("编辑完成后保存文件, 关闭文件继续...")
    os.system("code -w -g "+toeditfilepath)
    # map edited ids to their new content/answer/solution/remark
    editor = GetDate() + "\t" + editor
    editedtexdata = ReadTextFile(toeditfilepath)
    edited_dict = GetEditedProblems(editedtexdata)
    editedIDList = []
    # write the .tex changes back to the database dict and save it
    for id in edited_dict:
        content, ans, solution, remark = edited_dict[id]
        # only problems whose four fields actually changed are touched
        if not (content == prodict[id]["content"] and ans == prodict[id]["ans"] and solution == prodict[id]["solution"] and remark == prodict[id]["remark"]):
            prodict[id]["content"] = content
            prodict[id]["ans"] = ans
            prodict[id]["solution"] = solution
            prodict[id]["remark"] = remark
            prodict[id]["edit"] = prodict[id]["edit"] + [editor]
            editedIDList.append(id)
    save_dict(prodict,"../题库0.3/problems.json")
    return generate_exp(editedIDList) # expression of the ids actually edited
|
||
|
||
|
||
def RemoveMutualLink(metadata, prodict):
    """Remove mutual id links (same/related/unrelated) between problem pairs.

    *metadata* holds one pair of problem ids per line, separated by spaces
    or tabs; ids are zero-padded to 6 digits. Both directions of the link
    are removed from all three fields. Returns 0.

    Bug fix: the old guard was ``if not line == []`` which compared a string
    to a list and was therefore always true, so a blank line crashed the
    two-value unpack below; blank lines are now skipped.
    """
    for raw in metadata.split("\n"):
        line = re.sub(r"[\s]+", r"\t", raw).strip()
        if not line:
            continue  # ignore empty/whitespace-only lines
        id1, id2 = line.split("\t")
        id1 = id1.zfill(6)
        id2 = id2.zfill(6)
        for field in ["same", "related", "unrelated"]:
            if id1 in prodict[id2][field]:
                prodict[id2][field].remove(id1)
            if id2 in prodict[id1][field]:
                prodict[id1][field].remove(id2)
    return 0
|
||
|
||
def jsonEditProblemMetadata(id_string,prodict,editor): # edit problem metadata as JSON in VS Code
    """Export the selected problems to a temp JSON file, open it in VS Code,
    and merge any edits back into *prodict*.

    Blocks until the editor window closes, then stamps every changed
    problem's edit history with date+editor. Returns the edited ids as a
    ":"/"," expression. Note: *prodict* is modified in place but not saved.
    """
    jsontoeditpath = "临时文件/problem_edit.json"
    idlist = generate_number_set(id_string,prodict)
    edit_dict = {}
    for id in idlist:
        edit_dict[id] = prodict[id].copy()
    save_dict(edit_dict,jsontoeditpath)
    # open the JSON file for editing (blocks until the window closes)
    os.system("code -w -g "+jsontoeditpath)
    # the steps below run automatically after the editor window is closed
    editeddict = load_dict(jsontoeditpath)
    editlist = []
    for id in editeddict:
        if not prodict[id] == editeddict[id]:
            prodict[id] = editeddict[id].copy()
            prodict[id]["edit"] = prodict[id]["edit"] + [GetDate() + "\t" + editor]
            editlist.append(id)
    return(generate_exp(editlist)) # expression of the edited ids
|
||
|
||
def GetSamePairs(prodict):
    """Collect the annotated groups of identical problems.

    A group is [id] + that problem's "same" list, included only when the
    anchor problem has a non-empty "same" list AND at least one usage.
    """
    return [
        [pid] + entry["same"]
        for pid, entry in prodict.items()
        if len(entry["same"]) > 0 and len(entry["usages"]) != 0
    ]
|
||
|
||
def ShareSameUsages(prodict, same_group):
    """Merge the usage records of every problem in *same_group* and write
    the merged (order-preserving, de-duplicated) list back to each of them.

    Returns (same_group, merged usage list).
    """
    merged = []
    for pid in same_group:
        for usage in prodict[pid]["usages"]:
            if usage not in merged:
                merged.append(usage)
    for pid in same_group:
        prodict[pid]["usages"] = merged.copy()
    return (same_group, merged)
|
||
|
||
def SortUsages(prodict):
    """Sort every problem's usage records lexicographically, in place.

    Each "usages" list is replaced by a fresh sorted copy. Returns 0.
    """
    for pid in prodict:
        prodict[pid]["usages"] = sorted(prodict[pid]["usages"])
    return 0
|
||
|
||
|
||
def SortUsagesbyAverage(theusages):
    """Order usage records by their mean score rate, highest first.

    Uses Python's stable sort, so records with equal means keep their
    original relative order. Returns a new sorted list.
    """
    return sorted(theusages, key=lambda usage: parseUsage(usage)["glossdiff"], reverse=True)
|
||
|
||
|
||
def StripSuffix(string, suf_words):
    """Trim surrounding whitespace, then cut each suffix word (and any
    non-whitespace tail after it) off the end of the string, in order."""
    result = string.strip()
    for word in suf_words:
        result = re.sub(word + r"[\S]*$", "", result)
    return result
|
||
|
||
def MatchCondition(problem,condition_dict): # does *problem* satisfy every filter in condition_dict?
    """Return True iff *problem* satisfies every condition in *condition_dict*.

    Keys are field names, optionally with digits appended (to allow several
    independent conditions on the same field) and/or a "_not" marker for
    negative conditions. Values are a regex string or list of regexes; a
    value of [""] disables the condition. A positive condition needs at
    least one regex to match; a negative condition fails when any regex
    matches. List-valued fields are joined with newlines before matching.
    """
    match = True # assume a match until some condition fails
    for fieldraw in [c for c in condition_dict if not "_not" in c and not condition_dict[c] == [""]]: # positive conditions ([""] = disabled)
        cond_list = condition_dict[fieldraw]
        if type(cond_list) == str:
            cond_list = [cond_list]
        field = re.sub(r"\d","",fieldraw) # digits only disambiguate duplicate keys
        if type(problem[field]) == list: # flatten list fields into one string
            string = "\n".join((problem[field]))
        else:
            string = str(problem[field])
        current_match = False
        for cond in cond_list: # one matching regex satisfies this field
            if len(re.findall(cond,string)) > 0:
                current_match = True
        if current_match == False:
            match = False # any unsatisfied positive condition fails the match
    for fieldraw in [c for c in condition_dict if "_not" in c and not condition_dict[c] == [""]]: # negative conditions ([""] = disabled)
        cond_list = condition_dict[fieldraw]
        fieldraw = fieldraw.replace("_not","")
        field = re.sub(r"\d","",fieldraw)
        if type(problem[field]) == list: # flatten list fields into one string
            string = "\n".join((problem[field]))
        else:
            string = str(problem[field])
        current_match = True
        for cond in cond_list: # any matching regex fails this field
            if len(re.findall(cond,string)) > 0:
                current_match = False
        if current_match == False:
            match = False
    return match # True iff every condition is satisfied
|
||
|
||
|
||
def get_color(value):
    """Map a score rate in [0, 1] to an "{r,g,0}" RGB triple string.

    High rates shade toward red, low rates toward green; 0.5 gives yellow.

    Bug fix: the format string used "%3f" (minimum width 3, default 6-digit
    precision) for the green channel instead of "%.3f", producing e.g.
    "1.000000" — both channels now use three decimal places.
    """
    value = float(value)
    if value >= 0.5:
        (r, g) = (1, 2 - 2 * value)
    else:
        (r, g) = (2 * value, 1)
    return "{%.3f,%.3f,0}" % (r, g)
|
||
|
||
def GenerateValueColorCode(matchobj):
    """LaTeX for a black-bordered box whose fill color encodes the rate.

    *matchobj* is a regex match whose group 1 is the three-decimal rate.
    """
    rate = matchobj.group(1)
    return "\t\\fcolorbox[rgb]{0,0,0}%s{%s}" % (get_color(rate), rate)
|
||
|
||
def StudentsGetAfterContent(id, prodict, answered, spaceflag):
    """Build the text that follows a problem in the student handout.

    *answered* adds a red answer line ("暂无答案" when the answer is empty);
    *spaceflag* adds the problem's vertical space when it has one.
    """
    parts = []
    if answered:
        ans = prodict[id]["ans"]
        parts.append("答案: \\textcolor{red}{%s}\n\n" % (ans if ans != "" else "暂无答案"))
    if spaceflag and prodict[id]["space"] != "":
        parts.append("\\vspace*{%s}\n\n" % prodict[id]["space"])
    return "".join(parts)
|
||
|
||
def XeLaTeXCompile(filedir, filename):
    """Run XeLaTeX twice on *filename*, outputting into *filedir*.

    Two passes are needed for correct cross-references and page numbers.
    Returns True only when every pass exits with status 0.
    """
    succeeded = True
    for attempt in range(2):
        status = os.system("xelatex -interaction=batchmode -output-directory=%s %s" % (filedir, filename))
        if status == 0:
            print("第%d次编译成功." % (attempt + 1))
        else:
            succeeded = False
    return succeeded
|
||
|
||
def GenerateStudentsBodyString(problems,sectiontitles,pro_dict,consecutivenumbering,answered,spaceflag): # build the body of the student-version .tex file
    """Render problem id expressions into the LaTeX body of a handout.

    When *problems* and *sectiontitles* have equal length, one \\section per
    title is emitted and numbering either continues across sections
    (*consecutivenumbering*) or restarts at each one. Otherwise all id
    expressions are merged into a single flat enumerate. *answered* and
    *spaceflag* control each problem's answer line and vertical space.
    """
    bodystring = ""
    if len(problems) == len(sectiontitles):
        count = 0
        for i in range(len(problems)):
            idlist = generate_number_set(problems[i],pro_dict)
            # \setcounter continues the item numbering when requested
            sectionstring = "\\section{%s}\n\\begin{enumerate}\n\\setcounter{enumi}{%d}\n\n"%(sectiontitles[i],count if consecutivenumbering else 0)
            for id in idlist:
                count += 1
                aftercontent = StudentsGetAfterContent(id,pro_dict,answered,spaceflag)
                sectionstring += "\\item {\\tiny(%s)} %s\n\n%s"%(id,pro_dict[id]["content"],aftercontent)
            sectionstring += "\\end{enumerate}"
            bodystring += sectionstring
    else:
        # length mismatch: fall back to one flat enumerate over all ids
        idstring = ",".join(problems)
        idlist = generate_number_set(idstring,pro_dict)
        sectionstring = "\\begin{enumerate}\n\n"
        for id in idlist:
            aftercontent = StudentsGetAfterContent(id,pro_dict,answered,spaceflag)
            sectionstring += "\\item {\\tiny(%s)} %s\n\n%s"%(id,pro_dict[id]["content"],aftercontent)
        sectionstring += "\\end{enumerate}"
        bodystring += sectionstring
    return bodystring # the LaTeX body string
|
||
|
||
if __name__ == "__main__":
    # This module is a library of database utilities; running it directly
    # only prints a usage notice.
    print("数据库工具, import用.")