# coding=gbk
# -*- coding:utf8 -*-
def u(str):
    """Decode a GBK-encoded byte string into text.

    The codec is GBK (not UTF-8), matching the ``coding=gbk`` declaration
    on the first line of this file.

    Args:
        str: A GBK-encoded byte string, or text that is already decoded.

    Returns:
        The decoded text. Input that is not a byte string is returned
        unchanged.
    """
    # Only byte strings carry an encoding. Calling .decode() on text that is
    # already decoded triggers an implicit ASCII encode on Python 2 (failing
    # for Chinese text) and does not exist at all on Python 3.
    if isinstance(str, bytes):
        return str.decode("gbk")
    return str
def recognize_one_token(str, dictionary):
    """Find the longest prefix of ``str`` that is a dictionary word.

    Candidates are tried from the whole string down to a single character,
    dropping one trailing character at a time.

    Args:
        str: The text whose leading word should be recognized.
        dictionary: A container of known words (anything supporting ``in``).

    Returns:
        The end position (length) of the matched prefix, or 0 when no
        prefix of ``str`` is in the dictionary.
    """
    # Iterate instead of recursing once per dropped character, so long
    # unmatched runs cannot exceed the interpreter's recursion limit.
    for end in range(len(str), 0, -1):
        candidate = str[:end]
        # Debug trace kept from the original: every candidate is echoed.
        print(candidate)
        if candidate in dictionary:
            return end
    # The original also echoed the empty remainder before giving up.
    print(str[:0])
    return 0
def tokenize(str, dictionary):
    """Split ``str`` into dictionary words, scanning front to back.

    At each position the longest dictionary word starting there is taken
    (as reported by ``recognize_one_token``). When nothing matches, one
    character is skipped and does not appear in the result.

    Args:
        str: The text to split.
        dictionary: A container of known words.

    Returns:
        A list of the matched words in order of appearance; empty when
        ``str`` is empty or contains no dictionary word.
    """
    tokens = []
    pos = 0
    total = len(str)
    # A cursor loop replaces the original recursion (one frame per token or
    # skipped character), so long inputs cannot hit the recursion limit and
    # the result list is built without repeated concatenation.
    while pos < total:
        matched = recognize_one_token(str[pos:], dictionary)
        if matched == 0:
            # No word starts here: drop this character and move on.
            pos += 1
        else:
            tokens.append(str[pos:pos + matched])
            pos += matched
    return tokens
def parse(str):
    """Split a sentence into words and return them as a list.

    Step 1 is word segmentation. Step 2 (building a syntax structure) is
    not implemented yet, so the segmentation result is returned directly.

    Args:
        str: The sentence to parse; it needs to be unicode (decoded) text.

    Returns:
        The list of words produced by ``tokenize``.
    """
    # 1. Build the dictionary used for segmentation. A real list is built
    #    (rather than map()) because the dictionary is searched many times
    #    and a lazy iterator would be exhausted by the first search.
    dictionary = [u(word) for word in
                  ['十七楼', '我', '我要去', '有什么建议吗', '二十楼', '三十楼']]
    # 2. Match words front to back, preferring the longest match.
    return tokenize(str, dictionary)
class Literal(object):
    """A value backed by a single string.

    Inherits from ``object`` so that it is a new-style class on Python 2.
    """

    def __init__(self, stringval):
        """Store ``stringval`` as the literal's content."""
        self.m_stringval = stringval

    def content(self):
        """Return the stored string."""
        return self.m_stringval

    def match(self, condition):
        """Return True when the stored content equals ``condition``."""
        return self.content() == condition

    def __repr__(self):
        """Return a debugging representation such as ``Literal('x')``."""
        return "%s(%r)" % (type(self).__name__, self.m_stringval)
class When(Literal):
    """Literal holding a time string (presumably the "when" of a Fact)."""

    def __init__(self, timestring):
        """Store ``timestring`` as this literal's content."""
        Literal.__init__(self, stringval=timestring)
class Where(Literal):
    """Literal holding a place string (presumably the "where" of a Fact)."""

    def __init__(self, placestring):
        """Store ``placestring`` as this literal's content."""
        Literal.__init__(self, stringval=placestring)
class Who(Literal):
    """Literal holding a name string (presumably the "who" of a Fact)."""

    def __init__(self, namestring):
        """Store ``namestring`` as this literal's content."""
        Literal.__init__(self, stringval=namestring)
class Cause(Literal):
    """Literal holding a cause string (presumably the "cause" of a Fact)."""

    def __init__(self, causestring):
        """Store ``causestring`` as this literal's content."""
        Literal.__init__(self, stringval=causestring)
class Course(Literal):
    """Literal holding a course string (presumably the "course" of a Fact)."""

    def __init__(self, coursestring):
        """Store ``coursestring`` as this literal's content."""
        Literal.__init__(self, stringval=coursestring)
class Result(Literal):
    """Literal holding a result string (presumably the "result" of a Fact)."""

    def __init__(self, resultstring):
        """Store ``resultstring`` as this literal's content."""
        Literal.__init__(self, stringval=resultstring)
class Fact(object):
    """A fact described by six elements.

    The six-element model may not be the best fit; mainly, cause, course
    and result may need to be broken down further.

    Inherits from ``object`` so that it is a new-style class on Python 2.
    """

    def __init__(self, when, where, who, cause, course, result):
        """Store the six elements exactly as given."""
        self.m_when = when
        self.m_where = where
        self.m_who = who
        self.m_cause = cause
        self.m_course = course
        self.m_result = result

    def when(self):
        """Return the stored ``when`` element."""
        return self.m_when

    def where(self):
        """Return the stored ``where`` element."""
        return self.m_where

    def who(self):
        """Return the stored ``who`` element."""
        return self.m_who

    def cause(self):
        """Return the stored ``cause`` element."""
        return self.m_cause

    def course(self):
        """Return the stored ``course`` element."""
        return self.m_course

    def result(self):
        """Return the stored ``result`` element."""
        return self.m_result

    def __repr__(self):
        """Return a debugging representation listing all six elements."""
        return "%s(%r, %r, %r, %r, %r, %r)" % (
            type(self).__name__,
            self.m_when,
            self.m_where,
            self.m_who,
            self.m_cause,
            self.m_course,
            self.m_result,
        )
class Query(object):
    """A question paired with the facts it should be answered from.

    Inherits from ``object`` so that it is a new-style class on Python 2.
    """

    def __init__(self, question, facts):
        """Store the question, its parsed words, and the fact base.

        Args:
            question: The question text; it is passed to ``parse``.
            facts: An iterable of objects exposing a ``where()`` method.
        """
        self.m_question = question  # the question text
        self.m_tokens = parse(question)  # words parsed from the question
        self.m_facts = facts  # the fact base

    def answer(self):
        """Return the facts whose place matches a word in the question.

        The intent is to understand the question, search the facts and
        produce a natural answer; for now the matching facts themselves
        are returned.

        The words are scanned front to back by a small state machine:
        state 0 is the initial state, state 1 means a "going to a place"
        intent was seen. Once in state 1, every following word is looked
        up as a place; the state is never reset.

        Returns:
            A list of matching facts, in scan order.
        """
        # Decode the trigger phrase once instead of on every iteration.
        trigger = u("我要去")
        state = 0
        result = []
        for eachtoken in self.m_tokens:
            if trigger == eachtoken:
                state = 1
            elif state == 1:
                # Search the facts for a matching place.
                result.extend(
                    eachfact for eachfact in self.m_facts
                    if eachfact.where() == eachtoken
                )
        return result
if __name__ == '__main__':
    # Test code: build two facts, ask where "I" am going, and print every
    # fact that matches.
    facts = [
        Fact(u("4月17号"), u("十七楼"), u("我"), u("需要钱"), u("找张三借10块钱"), u("张三借给了我")),
        Fact(u("4月18号"), u("二十楼"), u("我"), u("要开会"), u("要过去"), u("")),
    ]
    q = Query(u("我要去二十楼"), facts)
    for f in q.answer():
        fields = (f.when(), f.where(), f.who(), f.cause(), f.course(), f.result())
        # One space between fields, matching a comma-separated print.
        print(" ".join(fields))