def FindSimilarPassageFromSet(news_set, example_tf):
    """Pick the file in ``news_set`` most similar to ``example_tf``.

    Every path in ``news_set`` is turned into a term-frequency mapping with
    ``GetTermFreqFromFile`` (restricted to the keys of ``example_tf``) and
    scored against ``example_tf`` with ``CosinSimilarityForDict``.  Paths for
    which either helper returns ``None`` are skipped.

    Args:
        news_set: Set of candidate file paths.  On success the chosen path
            is removed from this set in place.
        example_tf: Mapping whose keys are the terms to look for; it is also
            passed as the reference to ``CosinSimilarityForDict``.

    Returns:
        The file path of the best-scoring candidate when its
        ``Relevant()`` check passes; ``None`` when there is no scorable
        candidate or the best one is not relevant.
    """
    tags = list(example_tf.keys())

    heap = []
    for file_path in news_set:
        tf = GetTermFreqFromFile(tags, file_path)
        if tf is None:
            continue
        similarity = CosinSimilarityForDict(example_tf, tf)
        if similarity is not None:
            # Store the negated score: heapq is a min-heap, so the smallest
            # stored value corresponds to the highest similarity.
            # NOTE(review): assumes SimilarPassage orders by its first
            # argument -- confirm against the class definition.
            heap.append(SimilarPassage(similarity * -1.0, file_path))

    if not heap:
        return None

    # Popping the minimum of the negated scores yields the best match.
    heapq.heapify(heap)
    result = heapq.heappop(heap)
    if not result.Relevant():
        return None

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Similarity: " + str(result.similarity))
    # Remove the winner from the caller's set (in-place mutation).
    news_set.discard(result.file_path)
    return result.file_path