"""Convert exported per-question score sheets into usage/difficulty records.

The working directory (``filepath``) must contain exactly one ``.txt`` (or
``.tex``) description file and one ``.zip`` export per homework listed in it.

Description file format: homework blocks separated by one or more blank
lines.  In each block the first line is the homework id (it must occur in the
name of exactly one zip file), the second line is the date, and every
following line is ``<question index> <target id>`` separated by whitespace.
A block whose first line starts with ``#`` is ignored.

Results are written, relative to the current working directory, to
``临时文件/自动转换结果.txt`` and ``文本文件/metadata.txt``.
"""
import os
import re
import zipfile
from pathlib import Path

import pandas as pd

# Working directory: must hold exactly one .txt/.tex file and the .zip exports.
# Homework blocks whose first line starts with "#" are not read.
filepath = r"C:\Users\weiye\Documents\wwy sync\xiaoxian待导入"

# Base year used to derive the graduating-class label (see ``shiftdict``).
semester = 2023
# threshold = 0.5  # (unused) data is valid when a class's hand-in ratio exceeds this

# Years to add to ``semester`` to obtain the graduation year of each grade.
shiftdict = {"高一": 3, "高二": 2, "高三": 1}

# Column-header patterns.  Group 1 is the question index; group 2 is either
# the full mark (digits) or the literal "步", in which case the full mark is
# asked for interactively.
patterns = [r"填空([\d\.]+)\((\d+)\)",r"单选([\d\.]+)\((\d+)\)",r"^([\d\.]+)第\d+(步)"]

# Scratch location every extracted sheet is moved to before being read.
_TEMP_XLSX = "临时文件/statsfile.xlsx"


def getindex(string: str, pos: int = 2) -> int:
    """Return the ``pos``-th (1-based) dot-separated component of ``string`` as int."""
    return int(string.split(".")[pos - 1])


def stringcount(string, list):
    """Count the items of ``list`` that contain ``string`` as a substring.

    Returns ``(count, last_matching_item)``; the item is ``""`` when nothing
    matches.  The parameter name ``list`` shadows the builtin but is kept so
    that existing keyword callers keep working.
    """
    matches = [item for item in list if string in item]
    return (len(matches), matches[-1] if matches else "")


def _find_input_files(folder):
    """Return ``(description_file, zip_files)`` found directly in ``folder``."""
    files = [os.path.join(folder, name) for name in os.listdir(folder)]
    # Test the file suffix rather than a substring of the whole path, so that a
    # directory name containing ".txt"/".zip" cannot cause false matches.
    text_files = [f for f in files if f.lower().endswith((".tex", ".txt"))]
    if not text_files:
        raise FileNotFoundError(f"no .txt/.tex description file in {folder}")
    zip_files = [f for f in files if f.lower().endswith(".zip")]
    return text_files[0], zip_files


def _parse_homeworks(tex_data):
    """Parse the description text into ``{id: {"date": ..., "usage_data": [...]}}``."""
    tex_data = re.sub(r"\t+", r" ", tex_data.strip())
    homeworks = {}
    for block in re.split(r"\n{2,}", tex_data):
        lines = block.strip().split("\n")
        hid = lines[0].replace(" ", "")
        # Check for empty / commented-out blocks first, so that a one-line
        # "#..." block is skipped instead of failing on the missing date line.
        if not hid or hid.startswith("#"):
            continue
        if len(lines) < 2:
            raise ValueError(f"homework block {hid!r} has no date line")
        homeworks[hid] = {"date": lines[1], "usage_data": lines[2:]}
    return homeworks


def _load_sheet(zf, member):
    """Extract ``member`` from ``zf``, move it to the scratch path and read it."""
    extracted = Path(zf.extract(member))
    # Path.replace overwrites an existing target on every platform, so no
    # separate "remove the old file first" step is needed.
    extracted.replace(_TEMP_XLSX)
    return pd.read_excel(_TEMP_XLSX, skiprows=2)


def _collect_indices(columns, hid):
    """Map question index -> ``{column header: full mark}`` for scored columns."""
    indices = {}
    for col in columns:
        if not isinstance(col, str):
            # Non-text headers cannot match any pattern.
            continue
        for pattern in patterns:
            found = re.findall(pattern, col)
            if not found:
                continue
            question, mark = found[0]
            per_question = indices.setdefault(question, {})
            if "步" in mark:
                # Step-scored columns carry no full mark in the header.
                per_question[col] = int(input(f"{hid}-{col}的满分:"))
            else:
                per_question[col] = int(mark)
    return indices


def _class_label(excelfile):
    """Build the class label (graduation year, grade, zero-padded class number)."""
    gradename = re.findall(r"高[一二三]", excelfile)[0]
    number = re.findall(r"高[一二三]([\d]*?)班", excelfile)[0].zfill(2)
    return str(semester + shiftdict[gradename]) + "届" + gradename + number + "班"


def _process_homework(hid, homework, zip_path):
    """Return the output records (list of strings) for one homework's zip export."""
    date = homework["date"]
    records = []
    # The context manager closes every archive, including on errors.
    with zipfile.ZipFile(zip_path) as zf:
        handinfiles = [
            info.filename
            for info in zf.infolist()
            if "小题分_按学号" in info.filename
        ]
        if not handinfiles:
            raise ValueError(f"{zip_path}: no per-student score sheet found")

        # Only the column headers of the first sheet are needed here.
        indices = _collect_indices(_load_sheet(zf, handinfiles[0]).columns, hid)

        # target id -> {column header: full mark}
        corresp_dict = {}
        for rawline in homework["usage_data"]:
            fields = rawline.split()
            if not fields:
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{hid}: expected '<index> <target id>', got {rawline!r}"
                )
            question, target = fields
            if question in indices:
                corresp_dict[target] = indices[question].copy()

        for excelfile in handinfiles:
            # The last two rows of each sheet are dropped before averaging.
            df = _load_sheet(zf, excelfile)[:-2]
            classname = _class_label(excelfile)
            for target, colandmarks in corresp_dict.items():
                record = f"{target}\n{date}\t{classname}"
                for col, mark in colandmarks.items():
                    diff = df[col].mean() / mark
                    record += f"\t{diff:.3f}"
                records.append(record + "\n\n")
    return records


def main():
    """Run the conversion for every homework listed in the description file."""
    tex_file, zip_files = _find_input_files(filepath)

    # "utf-8-sig" also reads plain UTF-8 and strips a BOM if present, which
    # would otherwise end up inside the first homework id.
    with open(tex_file, "r", encoding="utf-8-sig") as f:
        homeworkdict = _parse_homeworks(f.read())

    # Every homework must correspond to exactly one zip file.
    execflag = True
    for hid in homeworkdict:
        if stringcount(hid, zip_files)[0] == 1:
            print("zip文件在文件夹中:",hid)
        else:
            execflag = False
            print("!!!zip文件个数不对:",hid)
    if not execflag:
        return

    pieces = ["usages\n\n"]
    for hid, homework in homeworkdict.items():
        print("正在处理%s"%hid)
        # zip_files already holds full paths; joining with filepath again
        # would double the directory when filepath is relative.
        zip_path = stringcount(hid, zip_files)[1]
        pieces.extend(_process_homework(hid, homework, zip_path))
    outputstr = "".join(pieces)

    with open("临时文件/自动转换结果.txt","w",encoding = "utf8") as f:
        f.write(outputstr)
    with open("文本文件/metadata.txt","w",encoding = "utf8") as f:
        f.write(outputstr)


if __name__ == "__main__":
    main()