# -*- encoding: utf-8 -*-
'''
@File : inverted_index.py
@Time : 2020/10/06 00:31:42
@Author : DataMagician
@Version : 1.0
@Contact : [email protected]
'''
# here put the import lib
from jieba import cut as jiebacut
from numpy import ndarray, array
# Standard stop words: full-width (CJK) punctuation followed by ASCII
# punctuation.  NOTE(review): the scraped original had two syntax-breaking
# tokens here — a bare ''' and a bare '\' — repaired below as the full-width
# apostrophe (U+FF07) and full-width backslash (U+FF3C) to match the
# surrounding full-width block.
base_stopwords = ['.', '!', '?', '"', '#'
    , '$', '%', '&', '\uff07', '(', ')', '*'
    , '+', ',', '-', '/', ':', ';', '<'
    , '=', '>', '@', '[', '\uff3c', ']', '^'
    , '_', '`', '{', '|', '}', '~', '⦅'
    , '⦆', '「', '」', '、', '\u3000', '、'
    , '〃', '〈', '〉', '《', '》', '「', '」'
    , '『', '』', '【', '】', '〔', '〕', '〖'
    , '〗', '〘', '〙', '〚', '〛', '〜', '〝'
    , '〞', '〟', '〰', '〾', '〿', '–', '—'
    , '‘', '’', '‛', '“', '”', '„', '‟', '…'
    , '‧', '﹏', '﹑', '﹔', '·', '.', '!'
    , '?', '"', '#', '$', '%', '&', "'", '('
    , ')', '*', '+', ',', '-', '/', ':', ';'
    , '<', '=', '>', '@', '[', '\\', ']', '^'
    , '_', '`', '{', '|', '}', '~']
# TODO: 构造倒排表类 (wrap this module into an inverted-index class)
def is_chinese_function(uchar) -> bool:
    '''
    Return True if the character is a CJK unified ideograph.

    Args:
        uchar: a single character (length-1 str)
    Returns:
        bool: True when uchar falls in U+4E00..U+9FA5
    '''
    # Chained comparison replaces the if/else-return-True/False anti-idiom.
    return '\u4e00' <= uchar <= '\u9fa5'
def is_number_function(uchar) -> bool:
    '''
    Return True if the character is an ASCII digit ('0'..'9').

    Args:
        uchar: a single character (length-1 str)
    Returns:
        bool: True when uchar falls in U+0030..U+0039
    '''
    return '\u0030' <= uchar <= '\u0039'
def is_alphabet_function(uchar) -> bool:
    '''
    Return True if the character is an ASCII letter (A-Z or a-z).

    Args:
        uchar: a single character (length-1 str)
    Returns:
        bool: True when uchar falls in U+0041..U+005A or U+0061..U+007A
    '''
    return ('\u0041' <= uchar <= '\u005a') or ('\u0061' <= uchar <= '\u007a')
def is_othe_function(uchar) -> bool:
    '''
    Return True if the character is none of: Chinese ideograph, ASCII digit,
    ASCII letter.  (Name keeps the original "othe" typo — it is public API.)

    Args:
        uchar: a single character (length-1 str)
    Returns:
        bool
    '''
    return not (is_chinese_function(uchar)
                or is_number_function(uchar)
                or is_alphabet_function(uchar))
def character_type_token(original) -> list:
    '''
    Split a text into runs of same-type characters (Chinese / digit /
    ASCII letter / other), using "|" as an internal separator.

    Args:
        original: the article as one string
    Returns:
        list[str]: the segments.  NOTE: a leading empty string appears when
        the first character is not Chinese, because the type tracker is
        seeded with the "Chinese" code (0) — preserved from the original.
    '''

    def _char_type(char):
        # One small integer per character class; is_othe_function covers
        # everything the first three miss, so no further fallback is needed.
        if is_chinese_function(char):
            return 0
        if is_number_function(char):
            return 1
        if is_alphabet_function(char):
            return 2
        return 3

    pieces = []
    prev_type = 0  # original algorithm starts the tracker at "Chinese"
    for char in original:
        cur_type = _char_type(char)
        if cur_type != prev_type:
            pieces.append("|")
        pieces.append(char)
        prev_type = cur_type
    # Only the previous type is needed, so the original's ever-growing
    # `make` list is replaced by a single variable.
    return "".join(pieces).split("|")
def context_function(paper_list) -> str:
    '''
    Concatenate a list of text fragments back into a single string.

    Args:
        paper_list: iterable of strings
    Returns:
        str: the fragments joined with no separator
    '''
    return "".join(paper_list)
def tokenize_chinese_function(original) -> list:
    '''
    Chinese word segmentation: split the text into same-type runs, re-join,
    then run jieba's segmenter over the result.

    Args:
        original: the article as one string
    Returns:
        list[str]: the segmented words
    '''
    return list(jiebacut(context_function(character_type_token(original))))
def word_punct_tokenizer_for_chinese_function(article_list: list
                                              , filter_stop_words=False) -> dict:
    '''
    Tokenize every article in the list (Chinese-aware).

    Args:
        article_list: list of article strings
        filter_stop_words: stop-word policy —
            * list/tuple/dict/set/ndarray: filter with that custom collection
            * truthy (e.g. True): filter with the base_stopwords list
            * falsy (False/None/0): no filtering (original returned None for
              values outside bool/collection; now treated as "no filter")
    Returns:
        dict: {article index: token list}
    '''
    # The container check must run BEFORE any truthiness test: the original
    # evaluated `filter_stop_words == True` first, which is element-wise for
    # an ndarray and raises on an `if`.
    if isinstance(filter_stop_words, (list, tuple, dict, ndarray, set)):
        return {
            paper_num: filter_stop_words_fumction(
                tokenize_chinese_function(paper),
                stop_words_dict=filter_stop_words)
            for paper_num, paper in enumerate(article_list)}
    if filter_stop_words:
        return {
            paper_num: filter_stop_words_fumction(tokenize_chinese_function(paper))
            for paper_num, paper in enumerate(article_list)}
    return {
        paper_num: tokenize_chinese_function(paper)
        for paper_num, paper in enumerate(article_list)}
def filter_stop_words_fumction(words_list
                               , stop_words_dict=None) -> list:
    '''
    Filter stop words out of a word list.  (Name keeps the original
    "fumction" typo — it is public API.)

    Args:
        words_list: list/ndarray of words to filter
        stop_words_dict: stop-word collection; defaults to base_stopwords
            (the None sentinel replaces the module-level-list default)
    Returns:
        list: words_list without the stop words, order preserved
    '''
    if stop_words_dict is None:
        stop_words_dict = base_stopwords
    # Hoist a set for O(1) membership instead of scanning a list per word.
    # set(dict) takes the keys, matching the original `in dict` semantics.
    blocked = set(stop_words_dict)
    return [word for word in words_list if word not in blocked]
def inverted_index_function(original
                            , filter_stop_words=True) -> dict:
    '''
    Build an inverted index over a list of articles.

    Args:
        original: list of article strings
        filter_stop_words: True → filter base_stopwords; False → keep every
            token; list/tuple/set/ndarray → filter that custom collection
    Returns:
        dict with keys:
            "article_tokens":   {article index: token list}
            "words_dictionary": set of all distinct tokens
            "inverted_index":   {token: ascending list of article indexes}
    Raises:
        TypeError: for an unsupported filter_stop_words value (the original
            fell through its guard and crashed with NameError at the return)
    '''
    if not isinstance(filter_stop_words, (bool, list, tuple, ndarray, set)):
        raise TypeError(
            "filter_stop_words must be a bool or a stop-word collection, got "
            + type(filter_stop_words).__name__)
    every_paper_token = word_punct_tokenizer_for_chinese_function(
        original, filter_stop_words=filter_stop_words)
    all_word_tokens = []
    inverted_index = dict()
    # Single pass per article — O(total tokens) — replacing the original
    # vocabulary × articles × tokens-per-article membership scans.
    for No, paper_tokens in every_paper_token.items():
        all_word_tokens += paper_tokens
        # dict.fromkeys dedupes while keeping first-occurrence order, so each
        # article contributes its index at most once per word (as before).
        for word in dict.fromkeys(paper_tokens):
            inverted_index.setdefault(word, []).append(No)
    return {
        "article_tokens": every_paper_token,
        "words_dictionary": set(all_word_tokens),
        "inverted_index": inverted_index}
if __name__ == "__main__":
    # Demo corpus: Chinese descriptions of network-tool commands.
    instructions_text = ['扫描本机所在网段上有哪些主机是存活的',
                         '端口扫描:输入目标主机ip,扫描某台主机开放了哪些端口',
                         '隐藏扫描,输入目标主机ip,只在目标主机上留下很少的日志信息',
                         'UDP端口扫描:输入目标主机ip,扫描目标主机开放了哪些UDP端口',
                         '操作系统识别:输入目标主机ip,查询是哪个系统',
                         '上传或者同步大型项目文件到服务器',
                         '检查本机网段内ip',
                         '查看本机网段内 激活/在线 的设备',
                         '查询本地公网ip',
                         '上传《网络工具》项目到GPU服务器',
                         '上传《网络工具》项目到华为服务器',
                         '查询系统运行时间',
                         '查询系统开机时间',
                         '查询系统历史启动时间',
                         '存储盘的位置',
                         '酒店开房数据集的位置']
    # Build and print the inverted index, filtering the custom stop words
    # "主机" and "扫描".  (The unused hard-coded `path` local was removed.)
    print(inverted_index_function(instructions_text, ["主机", "扫描"]))
# --- scraped page footer, kept as comments so the module parses ---
# 倒排表模块 (inverted-index module)
# 转载自 blog.csdn.net/weixin_43069769/article/details/109481888