database_tools中新增寻找关联题块功能
This commit is contained in:
parent
9353176294
commit
ac1590b26e
|
|
@ -60,14 +60,34 @@ def treat_dict(p_dict): #对整个题库字典中的内容部分进行预处理
|
|||
def detectmaxsim(currentid, excludelist, adict):
    """Find the problem most similar to ``currentid`` outside ``excludelist``.

    Similarity is the Jaro score between the preprocessed ``"content"``
    fields of two problems, as produced by ``treat_dict(adict)``.

    Args:
        currentid: Id (key of ``adict``) of the reference problem.
        excludelist: Ids that must not be considered as candidates.
        adict: Problem dictionary mapping id -> record with a ``"content"`` key.

    Returns:
        A tuple ``(maxsim, argmaxsim)``: the highest similarity found and the
        id that achieved it. When every id is excluded (or ``adict`` is
        empty) the initial values ``(-1, "000000")`` are returned.
    """
    maxsim = -1
    argmaxsim = "000000"
    # Compare preprocessed content, not the raw text.  The lookups below must
    # go through this dict: subscripting the ``treat_dict`` function itself
    # raises TypeError.
    treated_dict = treat_dict(adict)
    for pid in adict:
        if pid in excludelist:
            continue
        simrate = Levenshtein.jaro(
            treated_dict[pid]["content"],
            treated_dict[currentid]["content"],
        )
        # Strict ">" keeps the first id encountered when scores tie.
        if simrate > maxsim:
            maxsim = simrate
            argmaxsim = pid
    return (maxsim, argmaxsim)  # highest similarity and the id that achieved it
|
||||
|
||||
def generate_sim_group(startingids, prodict, max_size, threshold):
    """Grow a group of related problems outward from ``startingids``.

    Starting from the given ids, repeatedly add every not-yet-visited problem
    whose preprocessed ``"content"`` has a Jaro similarity of at least
    ``threshold`` with some problem in the current layer.  Expansion stops
    when a layer adds nothing new or once more than ``max_size`` ids have
    been collected.

    Args:
        startingids: Id expression string (colon/comma joined) that is
            expanded by ``generate_number_set``.
        prodict: Problem dictionary mapping id -> record with a ``"content"`` key.
        max_size: Stop expanding once the collected group exceeds this size.
        threshold: Minimum Jaro similarity for a problem to join the group.

    Returns:
        The result of ``generate_exp`` applied to the sorted collected ids
        (a colon/comma joined id expression string).
    """
    frontier = generate_number_set(startingids)
    collected = []
    treated_dict = treat_dict(prodict)
    # Every id that is already collected or queued.  Tracking this in a set
    # ensures a candidate similar to several frontier problems is queued only
    # once, so the group holds no duplicates and the max_size check counts
    # distinct problems.
    seen = set(frontier)
    while True:
        next_frontier = []
        for oldid in frontier:
            for newid in prodict:
                if newid in seen:
                    continue
                simrate = Levenshtein.jaro(
                    treated_dict[oldid]["content"],
                    treated_dict[newid]["content"],
                )
                if simrate >= threshold:
                    seen.add(newid)
                    next_frontier.append(newid)
        collected.extend(frontier)
        frontier = next_frontier
        # A layer that is queued when the size limit trips is intentionally
        # not added to the result, matching the original stopping rule.
        if not frontier or len(collected) > max_size:
            break
    return generate_exp(sorted(collected))  # colon/comma joined id expression
|
||||
|
||||
|
||||
def generate_problem_series(startingid,length,adict): #在adict字典里返回从startingid开始的一系列题号, 每一题都是与上一题的关联程度最大的
|
||||
excludelist = [startingid]
|
||||
currentid = startingid
|
||||
|
|
|
|||
Reference in New Issue