# -*- encoding: utf-8 -*-
'''
@File : inverted_index.py
@Time : 2020/10/06 00:31:42
@Author : DataMagician
@Version : 1.0
@Contact : [email protected]
'''
# here put the import lib
from jieba import cut as jiebacut
from numpy import ndarray, array
# Standard stop words: full-width (CJK) punctuation followed by ASCII
# punctuation.  NOTE(review): the scraped original had two syntax-breaking
# tokens here — a bare ''' and a bare '\' — repaired below as the full-width
# apostrophe (U+FF07) and full-width backslash (U+FF3C) to match the
# surrounding full-width block.
base_stopwords = ['.', '!', '?', '"', '#'
    , '$', '%', '&', '\uff07', '(', ')', '*'
    , '+', ',', '-', '/', ':', ';', '<'
    , '=', '>', '@', '[', '\uff3c', ']', '^'
    , '_', '`', '{', '|', '}', '~', '⦅'
    , '⦆', '「', '」', '、', '\u3000', '、'
    , '〃', '〈', '〉', '《', '》', '「', '」'
    , '『', '』', '【', '】', '〔', '〕', '〖'
    , '〗', '〘', '〙', '〚', '〛', '〜', '〝'
    , '〞', '〟', '〰', '〾', '〿', '–', '—'
    , '‘', '’', '‛', '“', '”', '„', '‟', '…'
    , '‧', '﹏', '﹑', '﹔', '·', '.', '!'
    , '?', '"', '#', '$', '%', '&', "'", '('
    , ')', '*', '+', ',', '-', '/', ':', ';'
    , '<', '=', '>', '@', '[', '\\', ']', '^'
    , '_', '`', '{', '|', '}', '~']
# TODO: 构造倒排表类 (wrap this module into an inverted-index class)
def is_chinese_function(uchar) -> bool:
    '''
    Return True if the character is a CJK unified ideograph.

    Args:
        uchar: a single character (length-1 str)
    Returns:
        bool: True when uchar falls in U+4E00..U+9FA5
    '''
    # Chained comparison replaces the if/else-return-True/False anti-idiom.
    return '\u4e00' <= uchar <= '\u9fa5'
def is_number_function(uchar) -> bool:
    '''
    Return True if the character is an ASCII digit ('0'..'9').

    Args:
        uchar: a single character (length-1 str)
    Returns:
        bool: True when uchar falls in U+0030..U+0039
    '''
    return '\u0030' <= uchar <= '\u0039'
def is_alphabet_function(uchar) -> bool:
    '''
    Return True if the character is an ASCII letter (A-Z or a-z).

    Args:
        uchar: a single character (length-1 str)
    Returns:
        bool: True when uchar falls in U+0041..U+005A or U+0061..U+007A
    '''
    return ('\u0041' <= uchar <= '\u005a') or ('\u0061' <= uchar <= '\u007a')
def is_othe_function(uchar) -> bool:
    '''
    Return True if the character is none of: Chinese ideograph, ASCII digit,
    ASCII letter.  (Name keeps the original "othe" typo — it is public API.)

    Args:
        uchar: a single character (length-1 str)
    Returns:
        bool
    '''
    return not (is_chinese_function(uchar)
                or is_number_function(uchar)
                or is_alphabet_function(uchar))
def character_type_token(original) -> list:
    '''
    Split a text into runs of same-type characters (Chinese / digit /
    ASCII letter / other), using "|" as an internal separator.

    Args:
        original: the article as one string
    Returns:
        list[str]: the segments.  NOTE: a leading empty string appears when
        the first character is not Chinese, because the type tracker is
        seeded with the "Chinese" code (0) — preserved from the original.
    '''

    def _char_type(char):
        # One small integer per character class; is_othe_function covers
        # everything the first three miss, so no further fallback is needed.
        if is_chinese_function(char):
            return 0
        if is_number_function(char):
            return 1
        if is_alphabet_function(char):
            return 2
        return 3

    pieces = []
    prev_type = 0  # original algorithm starts the tracker at "Chinese"
    for char in original:
        cur_type = _char_type(char)
        if cur_type != prev_type:
            pieces.append("|")
        pieces.append(char)
        prev_type = cur_type
    # Only the previous type is needed, so the original's ever-growing
    # `make` list is replaced by a single variable.
    return "".join(pieces).split("|")
def context_function(paper_list) -> str:
    '''
    Concatenate a list of text fragments back into a single string.

    Args:
        paper_list: iterable of strings
    Returns:
        str: the fragments joined with no separator
    '''
    return "".join(paper_list)
def tokenize_chinese_function(original) -> list:
    '''
    Chinese word segmentation: split the text into same-type runs, re-join,
    then run jieba's segmenter over the result.

    Args:
        original: the article as one string
    Returns:
        list[str]: the segmented words
    '''
    return list(jiebacut(context_function(character_type_token(original))))
def word_punct_tokenizer_for_chinese_function(article_list: list
                                              , filter_stop_words=False) -> dict:
    '''
    Tokenize every article in the list (Chinese-aware).

    Args:
        article_list: list of article strings
        filter_stop_words: stop-word policy —
            * list/tuple/dict/set/ndarray: filter with that custom collection
            * truthy (e.g. True): filter with the base_stopwords list
            * falsy (False/None/0): no filtering (original returned None for
              values outside bool/collection; now treated as "no filter")
    Returns:
        dict: {article index: token list}
    '''
    # The container check must run BEFORE any truthiness test: the original
    # evaluated `filter_stop_words == True` first, which is element-wise for
    # an ndarray and raises on an `if`.
    if isinstance(filter_stop_words, (list, tuple, dict, ndarray, set)):
        return {
            paper_num: filter_stop_words_fumction(
                tokenize_chinese_function(paper),
                stop_words_dict=filter_stop_words)
            for paper_num, paper in enumerate(article_list)}
    if filter_stop_words:
        return {
            paper_num: filter_stop_words_fumction(tokenize_chinese_function(paper))
            for paper_num, paper in enumerate(article_list)}
    return {
        paper_num: tokenize_chinese_function(paper)
        for paper_num, paper in enumerate(article_list)}
def filter_stop_words_fumction(words_list
                               , stop_words_dict=None) -> list:
    '''
    Filter stop words out of a word list.  (Name keeps the original
    "fumction" typo — it is public API.)

    Args:
        words_list: list/ndarray of words to filter
        stop_words_dict: stop-word collection; defaults to base_stopwords
            (the None sentinel replaces the module-level-list default)
    Returns:
        list: words_list without the stop words, order preserved
    '''
    if stop_words_dict is None:
        stop_words_dict = base_stopwords
    # Hoist a set for O(1) membership instead of scanning a list per word.
    # set(dict) takes the keys, matching the original `in dict` semantics.
    blocked = set(stop_words_dict)
    return [word for word in words_list if word not in blocked]
def inverted_index_function(original
                            , filter_stop_words=True) -> dict:
    '''
    Build an inverted index over a list of articles.

    Args:
        original: list of article strings
        filter_stop_words: True → filter base_stopwords; False → keep every
            token; list/tuple/set/ndarray → filter that custom collection
    Returns:
        dict with keys:
            "article_tokens":   {article index: token list}
            "words_dictionary": set of all distinct tokens
            "inverted_index":   {token: ascending list of article indexes}
    Raises:
        TypeError: for an unsupported filter_stop_words value (the original
            fell through its guard and crashed with NameError at the return)
    '''
    if not isinstance(filter_stop_words, (bool, list, tuple, ndarray, set)):
        raise TypeError(
            "filter_stop_words must be a bool or a stop-word collection, got "
            + type(filter_stop_words).__name__)
    every_paper_token = word_punct_tokenizer_for_chinese_function(
        original, filter_stop_words=filter_stop_words)
    all_word_tokens = []
    inverted_index = dict()
    # Single pass per article — O(total tokens) — replacing the original
    # vocabulary × articles × tokens-per-article membership scans.
    for No, paper_tokens in every_paper_token.items():
        all_word_tokens += paper_tokens
        # dict.fromkeys dedupes while keeping first-occurrence order, so each
        # article contributes its index at most once per word (as before).
        for word in dict.fromkeys(paper_tokens):
            inverted_index.setdefault(word, []).append(No)
    return {
        "article_tokens": every_paper_token,
        "words_dictionary": set(all_word_tokens),
        "inverted_index": inverted_index}
if __name__ == "__main__":
    # Demo corpus: Chinese descriptions of network-tool commands.
    instructions_text = ['扫描本机所在网段上有哪些主机是存活的',
                         '端口扫描:输入目标主机ip,扫描某台主机开放了哪些端口',
                         '隐藏扫描,输入目标主机ip,只在目标主机上留下很少的日志信息',
                         'UDP端口扫描:输入目标主机ip,扫描目标主机开放了哪些UDP端口',
                         '操作系统识别:输入目标主机ip,查询是哪个系统',
                         '上传或者同步大型项目文件到服务器',
                         '检查本机网段内ip',
                         '查看本机网段内 激活/在线 的设备',
                         '查询本地公网ip',
                         '上传《网络工具》项目到GPU服务器',
                         '上传《网络工具》项目到华为服务器',
                         '查询系统运行时间',
                         '查询系统开机时间',
                         '查询系统历史启动时间',
                         '存储盘的位置',
                         '酒店开房数据集的位置']
    # Build and print the inverted index, filtering the custom stop words
    # "主机" and "扫描".  (The unused hard-coded `path` local was removed.)
    print(inverted_index_function(instructions_text, ["主机", "扫描"]))
# --- scraped page footer, kept as comments so the module parses ---
# 倒排表模块 (inverted-index module)
# 转载自 blog.csdn.net/weixin_43069769/article/details/109481888