if freq_ratio > 5.0 and left_ent > 2.5 and right_ent > 2.5 and len(substr) >= 2 and not is_ascii(substr):
下面是利用Python实现关键词优化的demo(转自:http://www.wangzhanyouhua.net www.100ip.net www.seofuwu.cn 网站优化.webinfoextract.com/forum.php?mod=viewthread&tid=20)
def compute_entropy(word_list):
wdict={}
tot_cnt=0
print "%s\t%f"%(substr.encode('utf-8'),freq)
for w in word_list:
try:
if w not in wdict:
wdict[w] += 1
tot_cnt+=1
ent=0.0
for k,v in wdict.items():
p=1.0*v/tot_cnt
ent -= p * math.log(p)
新闻 0.000100
def count_substr_freq():
fp = open("./video.corpus")
return ent
str_freq={}
str_left_word={}
str_right_word={}
python ./findwords.py > result
tot_cnt=0
return p
for line in fp:
line=line.strip('\n')
for i in range(l):
for j in range(i+1,l):
continue
if j - i 0:
else:
left_word='^'
if j < l-1: right_word=st[j+1] else: right_word='%' str_left_word[w].append(left_word) str_right_word[w].append(right_word) tot_cnt+=1 for k,v in str_freq.items(): if v >= 10:
p *= dict[w]
字幕 0.000055
if len(items) < 4:
left_ent=compute_entropy(str_left_word[k])
right_ent=compute_entropy(str_right_word[k])
print "%s\t%f\t%f\t%f"%(k,v*1.0/tot_cnt,left_ent,right_ent)
word_freq[w]=0.0
if __name__ == "__main__":
if len(line) < 2:
count_substr_freq()
fp = open("./substr.freq")
for line in fp:
st = line[0].decode('utf-8')
freq = float(line[1])
continue
for w in st:
if w not in word_freq:
word_freq[w]+=freq
while True:
wdict[w] = 0
x,y = word_freq.popitem()
if x:
freq=y*1.0/tot_cnt
print "%s\t%f"%(x.encode('utf-8'),freq)
items = line.split('\t')
else:
break
except:
break
游戏 0.000050
if __name__ == "__main__":
count_freq()
findwords.py,输出凝合程度高,且左右邻字集合熵都较高的字符串:
def load_dict(filename):
dict={}
for line in fp:
舞蹈 0.000063
line=line.strip('\n')
本方法考虑了关键词的3个维度:
tot_cnt=0.0
item=line.split('\t')
if len(item) == 2:
return dict
fp=open(filename)
def is_ascii(s):
    """Return True if every character in *s* has a code point below 128.

    An empty string yields True, since there is no character that
    violates the condition.

    Args:
        s: An iterable of single characters (e.g. a text string).

    Returns:
        bool: True when all characters are in the 7-bit ASCII range,
        False as soon as one character has a code point >= 128.
    """
    # The body must be indented under the def; without indentation the
    # function does not parse at all.
    return all(ord(c) < 128 for c in s)
str_freq={}
for line in fp:
line = line.decode('utf-8')
substr = items[0]
freq = float(items[1])
关键词管 0.000051