"""Convert exported per-class problem statistics into usage metadata text.

The script reads the homework list (.txt/.tex) and the matching .zip archives
found in ``filepath`` and writes the collected usage records to
``临时文件/自动转换结果.txt`` and ``文本文件/metadata.txt`` (both relative to the
current working directory, which must already contain those folders).
Run the file as a script to perform the conversion.
"""
import os
import re
import zipfile
from pathlib import Path

import pandas as pd

# Working directory: it must contain exactly one .txt (or .tex) file and some
# .zip files; anything else in it is ignored.
# Homework entries whose first line starts with "#" are not read.
filepath = r"C:\Users\weiye\Documents\wwy sync\xiaoxian待导入"

# Cohort base year and the accepted hand-in ratio threshold.
semester = 2023
threshold = 0.5  # a class's data is valid when its hand-in ratio exceeds this

# Offset added to `semester` to obtain the cohort year used in class names.
shiftdict = {"高一": 3, "高二": 2, "高三": 1}

# Scratch locations of the spreadsheets pulled out of each archive.
STATS_XLSX = "临时文件/statsfile.xlsx"
HANDIN_XLSX = "临时文件/handinfile.xlsx"


def stringcount(string, list):
    """Count the items of *list* that contain *string*.

    Returns a tuple ``(count, item)`` where *item* is the last matching item,
    or ``""`` when nothing matches.
    """
    matches = [item for item in list if string in item]
    return (len(matches), matches[-1] if matches else "")


def parse_homework(tex_data):
    """Split the homework list text into a dict keyed by homework name.

    Entries are separated by one or more blank lines.  In each entry the first
    line is the name (spaces removed), the second line is the date and the
    remaining lines are the usage data.  Runs of tabs become a single space.
    Entries whose name starts with "#" and empty entries are skipped.

    Raises:
        ValueError: if a non-comment entry has no date line.
    """
    text = re.sub(r"\t+", " ", tex_data.strip())
    homework = {}
    for block in re.split(r"\n{2,}", text):
        lines = block.strip().split("\n")
        hw_id = lines.pop(0).replace(" ", "")
        if not hw_id or hw_id.startswith("#"):
            continue
        if not lines:
            raise ValueError("homework entry %r has no date line" % hw_id)
        homework[hw_id] = {"date": lines.pop(0), "usage_data": lines}
    return homework


def match_problem_ids(problems_indexes, usage_data):
    """Map each answer-sheet region number to the problem ID listed for it.

    A usage line belongs to a region number when it starts with that number
    followed by whitespace; the rest of the line is the mapped value.  When
    several lines match, the last one wins.
    """
    correspondence = {}
    for pind in problems_indexes:
        # Escape the cell value: numbers such as "1(2)" contain regex
        # metacharacters and must be matched literally.
        pattern = re.compile(re.escape(str(pind)) + r"\s")
        for line in usage_data:
            found = pattern.match(line)
            if found:
                correspondence[pind] = line[found.end():]
    return correspondence


def class_name_for(sf):
    """Build the class label (e.g. ``2026届高一03班``) from an archive member name.

    Raises:
        IndexError: if the name has no ``高X`` grade or no ``高X<digits>班`` part.
    """
    gradename = re.findall(r"高[一二三]", sf)[0]
    number = re.findall(r"高[一二三]([\d]*?)班", sf)[0]
    return f"{semester + shiftdict[gradename]}届{gradename}{number.zfill(2)}班"


def group_difficulties(problems_indexes, difficulties, correspondence):
    """Collect, per problem ID, the difficulty values of all its regions.

    Regions without an entry in *correspondence* are ignored; values keep the
    order in which they appear.
    """
    grouped = {}
    for pind, difficulty in zip(problems_indexes, difficulties):
        if pind in correspondence:
            grouped.setdefault(correspondence[pind], []).append(difficulty)
    return grouped


def format_usage(problem_id, date, classname, values):
    """Render one usage record: ID line, then date, class and 3-decimal values."""
    rates = "\t".join("%.3f" % float(v) for v in values)
    return problem_id + "\n" + date + "\t" + classname + "\t" + rates + "\n\n"


def extract_excel(zf, member, target):
    """Extract *member* from the open archive, move it to *target* and load it.

    The member is extracted relative to the current working directory first;
    ``Path.replace`` then overwrites any previous file at *target*.
    """
    Path(zf.extract(member)).replace(target)
    return pd.read_excel(target)


def main():
    """Run the whole conversion for the directory configured in ``filepath``."""
    # Locate the homework list and the archives by file name.
    names = os.listdir(filepath)
    text_names = [n for n in names if ".tex" in n or ".txt" in n]
    if not text_names:
        raise FileNotFoundError("no .txt/.tex homework list in " + filepath)
    tex_file = os.path.join(filepath, text_names[0])
    zip_files = [os.path.join(filepath, n) for n in names if ".zip" in n]

    # Split the list into homework entries (name, date, usage lines).
    with open(tex_file, "r", encoding="utf8") as f:
        homeworkdict = parse_homework(f.read())

    # Every entry must correspond to exactly one zip file.
    execflag = True
    for hw_id in homeworkdict:
        if stringcount(hw_id, zip_files)[0] == 1:
            print("zip文件在文件夹中:", hw_id)
        else:
            execflag = False
            print("!!!zip文件个数不对:", hw_id)
    if not execflag:
        return

    chunks = ["usages\n\n"]
    for hw_id, homework in homeworkdict.items():
        print("正在处理%s" % hw_id)
        date = homework["date"]
        zip_file = stringcount(hw_id, zip_files)[1]
        # The context manager closes every archive, not just the last one.
        with zipfile.ZipFile(zip_file) as zf:
            # Members holding the per-problem statistics.
            statfiles = [
                info.filename
                for info in zf.filelist
                if "试题分析" in info.filename
            ]

            # Region number -> problem ID, taken from the first statistics
            # sheet (first column, rows from index 2 on).
            df = extract_excel(zf, statfiles[0], STATS_XLSX)
            correspondence_dict = match_problem_ids(
                list(df[df.columns[0]][2:]), homework["usage_data"]
            )

            # Only members with a digit in their name are handled per class.
            for sf in statfiles:
                if not re.search(r"\d", sf):
                    continue
                classname = class_name_for(sf)
                df = extract_excel(zf, sf, STATS_XLSX)
                handin_member = sf[:sf.find("数学")] + "学生成绩.xlsx"
                handindf = extract_excel(zf, handin_member, HANDIN_XLSX)

                # The class counts only if the row at the threshold position
                # is not marked "缺考".
                marker_row = handindf.iloc[int(len(handindf) * threshold)]
                if str(marker_row["Unnamed: 4"]) == "缺考":
                    print(classname, "无效")
                    continue

                # Difficulty values come from column index 9 of the sheet.
                class_difficulty = group_difficulties(
                    list(df[df.columns[0]][2:]),
                    list(df[df.columns[9]][2:]),
                    correspondence_dict,
                )
                for problem_id, values in class_difficulty.items():
                    chunks.append(
                        format_usage(problem_id, date, classname, values)
                    )
                print(classname, "有效")

    outputstr = "".join(chunks)
    with open("临时文件/自动转换结果.txt", "w", encoding="utf8") as f:
        f.write(outputstr)
    with open("文本文件/metadata.txt", "w", encoding="utf8") as f:
        f.write(outputstr)


if __name__ == "__main__":
    main()