database_tools中新增字符串与数据库中已有题目匹配功能
This commit is contained in:
parent
edadebd5d7
commit
7a4fb0e696
|
|
@ -69,6 +69,25 @@ def detectmaxsim(currentid,excludelist,adict): #检测与已知题目关联程
|
|||
argmaxsim = id
|
||||
return (maxsim,argmaxsim) #返回最大关联系数与关联程度最大的题号
|
||||
|
||||
def stringmaxsim(string,adict,listlength):
    """Return the entries of ``adict`` that best match ``string``.

    ``string`` is passed through ``pre_treating`` and then scored with
    ``Levenshtein.jaro`` against the ``"content"`` field of every entry
    whose content does not contain the marker ``"OBSOLETE"``.

    Args:
        string: Text to match against the stored problems.
        adict: Mapping of problem id to a record with a ``"content"`` string.
        listlength: Maximum number of matches to return.

    Returns:
        A list of ``(id, similarity)`` tuples ordered by descending
        similarity and holding at most ``listlength`` items. Entries with
        equal similarity keep the iteration order of ``adict``. The list
        is empty when ``listlength`` is not positive.
    """
    # A non-positive limit previously raised IndexError (indexing [-1] on an
    # empty result list); there is nothing to return in that case.
    if listlength <= 0:
        return []

    string = pre_treating(string)
    # NOTE(review): the stored content is scored as-is while the query is
    # pre-treated -- confirm that stored contents are already normalised.
    candidates = [
        (problem_id, Levenshtein.jaro(record["content"], string))
        for problem_id, record in adict.items()
        if "OBSOLETE" not in record["content"]
    ]
    # list.sort is stable even with reverse=True, so ties stay in first-seen
    # order, matching the previous incremental top-k bookkeeping.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:listlength]  # the listlength best ids with their similarity
|
||||
|
||||
def generate_sim_group(startingids,prodict,max_size,threshold): #生成与已知题块startingids(冒号和逗号连接的字符串)关联程度超过threshold的题目组, 超过max_size即停止
|
||||
id_list = generate_number_set(startingids)
|
||||
output_list = []
|
||||
|
|
|
|||
Reference in New Issue