database_tools中新增寻找关联题块功能

This commit is contained in:
weiye.wang 2023-07-08 09:11:00 +08:00
parent 9353176294
commit ac1590b26e
1 changed files with 21 additions and 1 deletions

View File

@ -60,14 +60,34 @@ def treat_dict(p_dict): #对整个题库字典中的内容部分进行预处理
def detectmaxsim(currentid,excludelist,adict):
    """Find the problem most similar to ``currentid`` outside an exclusion list.

    The comparison is made on the "content" field of the preprocessed
    dictionary returned by ``treat_dict(adict)``, using ``Levenshtein.jaro``.

    Args:
        currentid: id of the reference problem; must be a key of the
            preprocessed dictionary whenever at least one candidate exists.
        excludelist: ids that must not be considered as candidates.
        adict: problem dictionary mapping id -> record with a "content" entry.

    Returns:
        A tuple ``(maxsim, argmaxsim)``: the highest similarity found and the
        id that achieved it. When there is no candidate at all the sentinel
        pair ``(-1, "000000")`` is returned.
    """
    maxsim = -1
    argmaxsim = "000000"
    treated_dict = treat_dict(adict)
    for candidate_id in adict:
        if candidate_id in excludelist:
            continue
        # Compare preprocessed contents. The lookup must go through the
        # treated_dict result, not the treat_dict function itself.
        simrate = Levenshtein.jaro(treated_dict[candidate_id]["content"],treated_dict[currentid]["content"])
        # Strict '>' keeps the first candidate reaching the maximum.
        if simrate > maxsim:
            maxsim = simrate
            argmaxsim = candidate_id
    return (maxsim,argmaxsim)
def generate_sim_group(startingids,prodict,max_size,threshold):
    """Grow a group of related problems starting from a known block of ids.

    Starting from the ids described by ``startingids``, the group is expanded
    round by round: every problem of ``prodict`` not yet in the group whose
    preprocessed "content" has a ``Levenshtein.jaro`` similarity of at least
    ``threshold`` with some id of the current frontier joins the next
    frontier. Expansion stops when a round adds nothing, or once the
    collected group holds more than ``max_size`` ids (the frontier found in
    that last round is then discarded).

    Args:
        startingids: id expression understood by ``generate_number_set``
            (a string of ids joined by colons and commas).
        prodict: problem dictionary mapping id -> record with "content".
        max_size: size above which the expansion stops.
        threshold: minimum similarity for a problem to join the group.

    Returns:
        The value of ``generate_exp`` applied to the sorted group ids
        (a colon/comma joined id expression).
    """
    id_list = generate_number_set(startingids)
    output_list = []
    treated_dict = treat_dict(prodict)
    # Every id already collected or queued; keeps membership tests O(1) and
    # guarantees each id enters the group at most once.
    seen_ids = set(id_list)
    continue_flag = True
    while continue_flag:
        appending_id_list = []
        for oldid in id_list:
            for newid in prodict:
                if newid in seen_ids:
                    continue
                simrate = Levenshtein.jaro(treated_dict[oldid]["content"],treated_dict[newid]["content"])
                if simrate >= threshold:
                    appending_id_list.append(newid)
                    # Mark immediately so that a problem similar to several
                    # frontier ids is not appended once per match.
                    seen_ids.add(newid)
        output_list = output_list + list(id_list)
        id_list = appending_id_list
        if not appending_id_list or len(output_list) > max_size:
            continue_flag = False
    return generate_exp(sorted(output_list))
def generate_problem_series(startingid,length,adict): #在adict字典里返回从startingid开始的一系列题号, 每一题都是与上一题的关联程度最大的
excludelist = [startingid]
currentid = startingid