From ac1590b26e53f63ca6f0c67459b33f3839456d7d Mon Sep 17 00:00:00 2001
From: "weiye.wang"
Date: Sat, 8 Jul 2023 09:11:00 +0800
Subject: [PATCH] =?UTF-8?q?database=5Ftools=E4=B8=AD=E6=96=B0=E5=A2=9E?=
 =?UTF-8?q?=E5=AF=BB=E6=89=BE=E5=85=B3=E8=81=94=E9=A2=98=E5=9D=97=E5=8A=9F?=
 =?UTF-8?q?=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 工具v2/database_tools.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/工具v2/database_tools.py b/工具v2/database_tools.py
index 5d1faac0..5b373295 100644
--- a/工具v2/database_tools.py
+++ b/工具v2/database_tools.py
@@ -60,14 +60,34 @@ def treat_dict(p_dict): #对整个题库字典中的内容部分进行预处理
 def detectmaxsim(currentid,excludelist,adict): #检测与已知题目关联程度最大的题目(除外列表之外的部分)
     maxsim = -1
     argmaxsim = "000000"
+    treated_dict = treat_dict(adict) # preprocess the contents once, before the comparison loop
     for id in adict:
         if not id in excludelist:
-            simrate = Levenshtein.jaro(adict[id]["content"],adict[currentid]["content"])
+            simrate = Levenshtein.jaro(treated_dict[id]["content"],treated_dict[currentid]["content"]) # index the preprocessed dict (treated_dict), not the treat_dict function
             if simrate > maxsim:
                 maxsim = simrate
                 argmaxsim = id
     return (maxsim,argmaxsim) #返回最大关联系数与关联程度最大的题号
 
+def generate_sim_group(startingids,prodict,max_size,threshold): # Grow the group of ids whose content similarity to the known block startingids (a colon/comma-joined id string) is >= threshold, expanding round by round from the newly matched ids; stop once more than max_size ids have been collected
+    id_list = generate_number_set(startingids)
+    output_list = []
+    treated_dict = treat_dict(prodict)
+    continue_flag = True
+    while continue_flag:
+        appending_id_list = []
+        for oldid in id_list:
+            for newid in [id for id in prodict if not id in id_list and not id in output_list and not id in appending_id_list]: # also skip ids already queued this round, so an id similar to several oldids is not appended twice
+                simrate = Levenshtein.jaro(treated_dict[oldid]["content"],treated_dict[newid]["content"])
+                if simrate >= threshold:
+                    appending_id_list.append(newid)
+        output_list = output_list + id_list.copy() # commit the ids that were just expanded
+        id_list = appending_id_list.copy() # the next round expands from the newly matched ids
+        if len(appending_id_list) == 0 or len(output_list)>max_size: # stop when nothing new matched or the group is already larger than max_size
+            continue_flag = False
+    return generate_exp(sorted(output_list)) # return the group as a colon/comma-joined id string
+
+
 def generate_problem_series(startingid,length,adict): #在adict字典里返回从startingid开始的一系列题号, 每一题都是与上一题的关联程度最大的
     excludelist = [startingid]
     currentid = startingid