# 加载词表,并确定词表中词最大长度
def load_dict(word_dict_path):
words_dict = {}
max_word_length = 0
with open(word_dict_path, encoding="utf8") as f:
for line in f:
word = line.strip()
words_dict[word] = 0
max_word_length = max(max_word_length, len(word))
return words_dict, max_word_length
def forward_max_matching(toCutString, word_dict, max_length):
words = [] # 保存分词
while toCutString != "":
length = min(max_length, len(toCutString)) # 确认待切分字符串长度和最大长度如果待切分词小于最大词长度时
word = toCutString[:length]
while word not in word_dict:
if len(word) == 1: # 如果词长度是1,那就退出循环
break
word = word[:len(word)-1]
words.append(word)
toCutString = toCutString(len(word):)
return words
上面方法虽然可行,但是当字符串长度特别长的时候耗时比较久,性能上有一些缺陷,这时候我们可以利用前缀字典进行优化,提高代码执行效率
def load_prefix_dict(path):
prefix_dict = {}
with open(path, encoding="utf8") as f:
for line in f:
word = line.strip()
for i in range(1, len(word)):
if word[:i] not in prefix_dict:
prefix_dict[word[:i]] = 0 # 0表示这不是词,但是是词前缀
prefix[word] = 1 # 1表示这是一个词
return prefix_dict
def forward_max_matching(tocutstring, prefix_dict):
if tocutstring == "":
return []
words = [] # 放切好的词
start_index, end_index = 0, 1 # 记录窗口的起始位置
window = tocutstring[start_index: end_index] # 从第一个字开始
find_word = window # 将第一个字先当作默认词
while start_index < len(tocutstring):
# 窗口没有在词典里出现
if window not in prefix_dict or end_index > len(tocutstring):
words.append(find_word) # 证明这个字不是前缀,可以分词
start_index += len(find_word)
end_index = start_index + 1
window = tocutstring[start_index: end_index] # 更新窗口
find_word = window
# 窗口在词典中,而且表示是一个词
elif prefix_dict[window] == 1:
find_word = window # 找了了一个词,但是还要看有没有比他更长的词
end_index += 1
window = string[start_index:end_index]
# 窗口是一个前缀
elif prefix_dict[window] == 0:
end_index += 1 # 向后错一位继续找
window = string[start_index:end_index]
# 最后找到的window如果不在词典里,把单独的字加入切词结果
if prefix_dict.get(window) != 1:
words += list(window)
else:
words.append(window)
return words
利用前缀字典的方法虽然代码稍显复杂,但效率会比第一种方式快很多。