def stringmaxsim(string, adict, listlength):
    """Return the entries of *adict* most similar to *string*.

    *string* is first passed through ``pre_treating``. Every entry of
    *adict* whose ``"content"`` does not contain the marker ``"OBSOLETE"``
    is then scored against it with ``Levenshtein.jaro``.

    Args:
        string: The query text to compare against the stored contents.
        adict: Mapping of id -> record; each record must provide a
            ``"content"`` string.
        listlength: Maximum number of results to return.

    Returns:
        A list of ``(id, similarity)`` tuples sorted by similarity in
        descending order, containing at most ``listlength`` items. Entries
        with equal similarity keep the iteration order of *adict*. An empty
        list is returned when ``listlength`` is zero or negative.
    """
    string = pre_treating(string)
    if listlength <= 0:
        # Nothing can be selected; without this guard the ranking logic
        # would have to index into an empty result list (IndexError).
        return []

    scored = []
    for key in adict:
        content = adict[key]["content"]
        if "OBSOLETE" in content:
            continue
        scored.append((key, Levenshtein.jaro(content, string)))

    # A single stable sort keeps ties in insertion order, so the first
    # `listlength` items are the best matches with earlier entries winning ties.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:listlength]