BERT代码的解读

 判断一个字符是否是汉字,依据的是该字符的 Unicode 码点(codepoint)所在的区间

#判断是否是中文字符:CJK 统一表意文字基本区的 Unicode 码点最小值为 0x4E00,最大值为 0x9FFF(其余扩展区见下方代码)
  def _is_chinese_char(self, cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
    # despite its name. The modern Korean Hangul alphabet is a different block,
    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and handled
    # like the all of the other languages.
    '''
    0x4e00-0x9fff cjk 统一字型 常用字 共 20992个(实际只定义到0x9fc3)
0x3400-0x4dff cjk 统一字型扩展表a 少用字 共 6656个
0x20000-0x2a6df cjk 统一字型扩展表b 少用字,历史上使用 共42720个
0xf900-0xfaff cjk 兼容字型 重复字,可统一变体,共同字 共512个
0x2f800-0x2fa1f cjk 兼容字型补遗 可统一变体 共544个
    '''
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
        (cp >= 0x3400 and cp <= 0x4DBF) or  #
        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
        (cp >= 0x2B820 and cp <= 0x2CEAF) or
        (cp >= 0xF900 and cp <= 0xFAFF) or  #
        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #

['this', 'text', 'is', 'included', 'to', 'make', 'sure', 'unicode', 'is', 'handled', 'properly', ':', '力', '加', '勝', '北', '区', 'ᴵ', '##ᴺ', '##ᵀ', '##ᵃ', '##ছ', '##জ', '##ট', '##ড', '##ণ', '##ত']
Text should be one-sentence-per-line, with empty lines between documents.

all_documents =[[['this', 'text', 'is', 'included', 'to', 'make', 'sure', 'unicode', 'is', 'handled', 'properly', ':', '力', '加', '勝', '北', '区', 'ᴵ', '##ᴺ', '##ᵀ', '##ᵃ', '##ছ', '##জ', '##ট', '##ড', '##ণ', '##ত'], ['text', 'should', 'be', 'one', '-', 'sentence', '-', 'per', '-', 'line', ',', 'with', 'empty', 'lines', 'between', 'documents', '.'], ['this', 'sample', 'text', 'is', 'public', 'domain', 'and', 'was', 'randomly', 'selected', 'from', 'project', 'gut', '##tenberg', '.']], [['the', 'rain', 'had', 'only', 'ceased', 'with', 'the', 'gray', 'streaks', 'of', 'morning', 'at', 'blazing', 'star', ',', 'and', 'the', 'settlement', 'awoke', 'to', 'a', 'moral', 'sense', 'of', 'clean', '##liness', ',', 'and', 'the', 'finding', 'of', 'forgotten', 'knives', ',', 'tin', 'cups', ',', 'and', 'smaller', 'camp', 'ut', '##ens', '##ils', ',', 'where', 'the', 'heavy', 'showers',

猜你喜欢

转载自blog.csdn.net/qqywm/article/details/85011973