4.文本规范化处理

下文中将定义一个规范化模块以处理文本文档规范化，并在后面建立分类器时使用这个处理模块。尽管有许多可用的技术，但是将坚持简化与直接原则，以便于更容易地一步步参照这里的实现。将在模块中实现和使用下面的规范化技术。

扩展缩写词。
通过词形还原实现文本处理规范化。
去除特殊字符与符号。
去停用词。

不再过多关注拼写纠正及其他高级技术，但如果你感兴趣，可以集成之前讲述过的这些内容。首先从载入一些依赖的模块开始，先准备实现缩写词扩展所需的缩写词映射表。

contractions.py 折叠源码

 
     
      
        
          # -*- coding: utf-8 -*- 
         
 
          """ 
         
 
          Created on Mon Aug 01 01:11:02 2016 
         
 
          @author: DIP 
         
 
          """ 
         

             
         
 
          # Maps English contractions to their expanded forms. Pronoun "I"
          # entries are listed in both capitalised and lower-case spellings.
          CONTRACTION_MAP = {
              "ain't": "is not",
              "aren't": "are not",
              "can't": "cannot",
              "can't've": "cannot have",
              "'cause": "because",
              "could've": "could have",
              "couldn't": "could not",
              "couldn't've": "could not have",
              "didn't": "did not",
              "doesn't": "does not",
              "don't": "do not",
              "hadn't": "had not",
              "hadn't've": "had not have",
              "hasn't": "has not",
              "haven't": "have not",
              "he'd": "he would",
              "he'd've": "he would have",
              "he'll": "he will",
              # Fixed: the expansion previously read "he he will have".
              "he'll've": "he will have",
              "he's": "he is",
              "how'd": "how did",
              "how'd'y": "how do you",
              "how'll": "how will",
              "how's": "how is",
              "I'd": "I would",
              "I'd've": "I would have",
              "I'll": "I will",
              "I'll've": "I will have",
              "I'm": "I am",
              "I've": "I have",
              "i'd": "i would",
              "i'd've": "i would have",
              "i'll": "i will",
              "i'll've": "i will have",
              "i'm": "i am",
              "i've": "i have",
              "isn't": "is not",
              "it'd": "it would",
              "it'd've": "it would have",
              "it'll": "it will",
              "it'll've": "it will have",
              "it's": "it is",
              "let's": "let us",
              "ma'am": "madam",
              "mayn't": "may not",
              "might've": "might have",
              "mightn't": "might not",
              "mightn't've": "might not have",
              "must've": "must have",
              "mustn't": "must not",
              "mustn't've": "must not have",
              "needn't": "need not",
              "needn't've": "need not have",
              "o'clock": "of the clock",
              "oughtn't": "ought not",
              "oughtn't've": "ought not have",
              "shan't": "shall not",
              "sha'n't": "shall not",
              "shan't've": "shall not have",
              "she'd": "she would",
              "she'd've": "she would have",
              "she'll": "she will",
              "she'll've": "she will have",
              "she's": "she is",
              "should've": "should have",
              "shouldn't": "should not",
              "shouldn't've": "should not have",
              "so've": "so have",
              "so's": "so as",
              "that'd": "that would",
              "that'd've": "that would have",
              "that's": "that is",
              "there'd": "there would",
              "there'd've": "there would have",
              "there's": "there is",
              "they'd": "they would",
              "they'd've": "they would have",
              "they'll": "they will",
              "they'll've": "they will have",
              "they're": "they are",
              "they've": "they have",
              "to've": "to have",
              "wasn't": "was not",
              "we'd": "we would",
              "we'd've": "we would have",
              "we'll": "we will",
              "we'll've": "we will have",
              "we're": "we are",
              "we've": "we have",
              "weren't": "were not",
              "what'll": "what will",
              "what'll've": "what will have",
              "what're": "what are",
              "what's": "what is",
              "what've": "what have",
              "when's": "when is",
              "when've": "when have",
              "where'd": "where did",
              "where's": "where is",
              "where've": "where have",
              "who'll": "who will",
              "who'll've": "who will have",
              "who's": "who is",
              "who've": "who have",
              "why's": "why is",
              "why've": "why have",
              "will've": "will have",
              "won't": "will not",
              "won't've": "will not have",
              "would've": "would have",
              "wouldn't": "would not",
              "wouldn't've": "would not have",
              "y'all": "you all",
              "y'all'd": "you all would",
              "y'all'd've": "you all would have",
              "y'all're": "you all are",
              "y'all've": "you all have",
              "you'd": "you would",
              "you'd've": "you would have",
              "you'll": "you will",
              "you'll've": "you will have",
              "you're": "you are",
              "you've": "you have",
          }
         
 
      
 
     
   

下面的代码段显示了必要的引用和依赖项：

 
          from  
          contractions  
          import  
          CONTRACTION_MAP 
         
          import  
          re 
         
          import  
          nltk 
         
          import  
          string 
         
          from  
          nltk.stem  
          import  
          WordNetLemmatizer 
         
          # English stop words from the NLTK corpus; read by remove_stopwords.
          stopword_list = nltk.corpus.stopwords.words('english')

          # Single lemmatizer instance shared by lemmatize_text.
          wnl = WordNetLemmatizer()

上面的代码中，载入了英文的停用词、来自 CONTRACTION_MAP 的缩写词映射，以及用于实现词形还原的 WordNetLemmatizer 实例。接下来，定义一个函数实现文本的切分，它将在其他规范化函数中使用。下面的函数实现词语切分，并去除切分后各标识首尾的多余空格。

 
          def tokenize_text(text):
              """Split ``text`` into word tokens.

              Tokenization is delegated to ``nltk.word_tokenize``; every
              resulting token then has leading/trailing whitespace stripped.

              Returns:
                  A list of token strings.
              """
              return [piece.strip() for piece in nltk.word_tokenize(text)]

定义扩展缩写词的函数。输入文本，如果有匹配的缩写，则返回包含扩展缩写词后的文本。下面的代码段有助于实现这些：

 
          def expand_contractions(text, contraction_mapping):
              """Expand the contractions in ``text`` and drop remaining apostrophes.

              Args:
                  text: The string to process.
                  contraction_mapping: Dict mapping a contraction (e.g. "can't")
                      to its expansion (e.g. "cannot").

              Returns:
                  ``text`` with every contraction from the mapping replaced by
                  its expansion (matched case-insensitively) and with all
                  remaining "'" characters removed.
              """
              if not contraction_mapping:
                  # An empty alternation would match the empty string at every
                  # position, so only the apostrophe clean-up applies here.
                  return text.replace("'", "")

              # Longest keys first: regex alternation takes the first branch
              # that matches, so "can't've" must precede its prefix "can't".
              ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
              contractions_pattern = re.compile(
                  '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
                  flags=re.IGNORECASE | re.DOTALL)

              # Last-resort lookup for keys that only exist with capitals.
              lowered_mapping = {key.lower(): value
                                 for key, value in contraction_mapping.items()}

              def expand_match(contraction):
                  match = contraction.group(0)
                  expanded_contraction = (
                      contraction_mapping.get(match)
                      or contraction_mapping.get(match.lower())
                      or lowered_mapping.get(match.lower()))
                  if not expanded_contraction:
                      # Nothing usable to substitute; keep the matched text.
                      return match
                  # Carry over a leading capital only. Copying the first
                  # matched character verbatim would corrupt expansions whose
                  # key starts with an apostrophe ("'cause" -> "because").
                  if match[0].isupper():
                      expanded_contraction = (expanded_contraction[0].upper()
                                              + expanded_contraction[1:])
                  return expanded_contraction

              expanded_text = contractions_pattern.sub(expand_match, text)
              # Remove whatever apostrophes are left (possessives, quotes, ...).
              return expanded_text.replace("'", "")

既然已经有了扩展缩写词的函数，接下来就可以实现一个使用词形还原把单词变换为词基或词根形式的函数，以对文本进行规范化处理。下面的函数有助于实现这些：

 
          from  
          pattern.en  
          import  
          tag 
         
          from  
          nltk.corpus  
          import  
          wordnet as wn 
         
          # Annotate text tokens with POS tags 
         
          def pos_tag_text(text):
              """Tag ``text`` and return lower-cased words with WordNet POS tags.

              The text is tagged with ``tag`` (imported from ``pattern.en``).
              Each tag is mapped by its first letter: J -> ``wn.ADJ``,
              V -> ``wn.VERB``, N -> ``wn.NOUN``, R -> ``wn.ADV``; any other
              tag maps to ``None``.

              Returns:
                  A list of ``(lower_cased_word, wordnet_tag_or_None)`` tuples.
              """
              # Tag prefix -> name of the corresponding constant on ``wn``.
              # The constant is looked up only when a prefix actually matches.
              prefix_to_wn_attr = (
                  ('J', 'ADJ'),
                  ('V', 'VERB'),
                  ('N', 'NOUN'),
                  ('R', 'ADV'),
              )

              def penn_to_wn_tags(pos_tag):
                  for prefix, wn_attr in prefix_to_wn_attr:
                      if pos_tag.startswith(prefix):
                          return getattr(wn, wn_attr)
                  return None

              tagged_lower_text = []
              for word, pos_tag in tag(text):
                  tagged_lower_text.append((word.lower(), penn_to_wn_tags(pos_tag)))
              return tagged_lower_text
         
          # lemmatize text based on POS tags    
         
          def lemmatize_text(text):
              """Return ``text`` with each word replaced by its lemma.

              Words are tagged through ``pos_tag_text``. A word with a truthy
              tag is passed to ``wnl.lemmatize`` together with that tag; a word
              without one is kept as returned by ``pos_tag_text``. The results
              are joined with single spaces.
              """
              lemmas = []
              for token, wn_tag in pos_tag_text(text):
                  if wn_tag:
                      lemmas.append(wnl.lemmatize(token, wn_tag))
                  else:
                      lemmas.append(token)
              return ' '.join(lemmas)

上面的代码片段描述了两个词形还原函数。主函数是 lemmatize_text，该函数接受文本数据，基于每个词的词性标签还原词形，接着给用户返回词形还原处理后的文本。为实现这个功能，需要标注每个文本符号的词性标签。使用 pattern 函数库中的 tag 函数对每个符号标注词性标签。因为 WordNetLemmatizer 基于 WordNet 格式的词性标签进行处理，所以需要通过 penn_to_wn_tags 把标注得到的词性标签转换为 WordNet 词性标签。pos_tag_text 将每个单词符号转换为小写，为其标注转换后的 WordNet 词性标签，并返回这些标注好的单词符号，最后由 lemmatize_text 函数使用这些符号。

下面的函数帮助我们实现了特殊符号和字符的去除：

 
          def remove_special_characters(text):
              """Strip punctuation characters from every token of ``text``.

              The text is tokenized with ``tokenize_text``; each character
              listed in ``string.punctuation`` is deleted from each token,
              tokens that end up empty are dropped, and the rest are joined
              with single spaces.
              """
              tokens = tokenize_text(text)
              punctuation_re = re.compile(
                  '[{}]'.format(re.escape(string.punctuation)))
              cleaned = (punctuation_re.sub('', token) for token in tokens)
              return ' '.join(piece for piece in cleaned if piece)

这里在文本切分之后再去除特殊字符，这样可以去除一些实际上属于缩写、但在第一步中未能去除的标识，例如 “'s”“'re” 等残留下来的 “s”“re”，它们将在去除停用词时被去除。当然，也可以不通过文本切分来去除这些特殊字符。这里通过正则表达式匹配来去除 string.punctuation 中定义的特殊字符。下面的函数有助于去除文本数据中的停用词。

 
          def remove_stopwords(text):
              """Return ``text`` without the tokens listed in ``stopword_list``.

              The text is tokenized with ``tokenize_text``; the comparison with
              the stop-word list is an exact (case-sensitive) match. Surviving
              tokens are joined with single spaces.
              """
              kept = []
              for token in tokenize_text(text):
                  if token in stopword_list:
                      continue
                  kept.append(token)
              return ' '.join(kept)

既然已经定义了全部的函数，就可以通过将所有函数一个接一个地连接在一起的方式建立文本处理流水线。下面的函数实现上述功能，输入文本文档语料，进行规范化处理，返回规范化处理后的文本文档语料。

 
          def normalize_corpus(corpus, tokenize=False):
              """Normalize every document in ``corpus``.

              Each document goes through, in order: ``expand_contractions``
              (with ``CONTRACTION_MAP``), ``lemmatize_text``,
              ``remove_special_characters`` and ``remove_stopwords``.

              Args:
                  corpus: Iterable of text documents (strings).
                  tokenize: When true, each normalized document is returned as
                      a token list (via ``tokenize_text``) instead of a string.

              Returns:
                  A list with exactly one entry per input document.
              """
              normalized_corpus = []
              for text in corpus:
                  text = expand_contractions(text, CONTRACTION_MAP)
                  text = lemmatize_text(text)
                  text = remove_special_characters(text)
                  text = remove_stopwords(text)
                  if tokenize:
                      # Store the token list *instead of* the string; the old
                      # code appended both, doubling the length of the result.
                      text = tokenize_text(text)
                  normalized_corpus.append(text)
              return normalized_corpus

至此，完成了文本规范化处理模块所需的全部函数的讨论和实现。

script.py 折叠源码

 
          from  
          contractions  
          import  
          CONTRACTION_MAP 
         
          import  
          re 
         
          import  
          nltk 
         
          import  
          string 
         
          from  
          nltk.stem  
          import  
          WordNetLemmatizer 
         
          # English stop words from the NLTK corpus; read by remove_stopwords.
          stopword_list = nltk.corpus.stopwords.words('english')

          # Single lemmatizer instance shared by lemmatize_text.
          wnl = WordNetLemmatizer()
         
          def tokenize_text(text):
              """Split ``text`` into word tokens.

              Tokenization is delegated to ``nltk.word_tokenize``; every
              resulting token then has leading/trailing whitespace stripped.

              Returns:
                  A list of token strings.
              """
              return [piece.strip() for piece in nltk.word_tokenize(text)]
         
          def expand_contractions(text, contraction_mapping):
              """Expand the contractions in ``text`` and drop remaining apostrophes.

              Args:
                  text: The string to process.
                  contraction_mapping: Dict mapping a contraction (e.g. "can't")
                      to its expansion (e.g. "cannot").

              Returns:
                  ``text`` with every contraction from the mapping replaced by
                  its expansion (matched case-insensitively) and with all
                  remaining "'" characters removed.
              """
              if not contraction_mapping:
                  # An empty alternation would match the empty string at every
                  # position, so only the apostrophe clean-up applies here.
                  return text.replace("'", "")

              # Longest keys first: regex alternation takes the first branch
              # that matches, so "can't've" must precede its prefix "can't".
              ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
              contractions_pattern = re.compile(
                  '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
                  flags=re.IGNORECASE | re.DOTALL)

              # Last-resort lookup for keys that only exist with capitals.
              lowered_mapping = {key.lower(): value
                                 for key, value in contraction_mapping.items()}

              def expand_match(contraction):
                  match = contraction.group(0)
                  expanded_contraction = (
                      contraction_mapping.get(match)
                      or contraction_mapping.get(match.lower())
                      or lowered_mapping.get(match.lower()))
                  if not expanded_contraction:
                      # Nothing usable to substitute; keep the matched text.
                      return match
                  # Carry over a leading capital only. Copying the first
                  # matched character verbatim would corrupt expansions whose
                  # key starts with an apostrophe ("'cause" -> "because").
                  if match[0].isupper():
                      expanded_contraction = (expanded_contraction[0].upper()
                                              + expanded_contraction[1:])
                  return expanded_contraction

              expanded_text = contractions_pattern.sub(expand_match, text)
              # Remove whatever apostrophes are left (possessives, quotes, ...).
              return expanded_text.replace("'", "")
         
          from  
          pattern.en  
          import  
          tag 
         
          from  
          nltk.corpus  
          import  
          wordnet as wn 
         
          # Annotate text tokens with POS tags 
         
          def pos_tag_text(text):
              """Tag ``text`` and return lower-cased words with WordNet POS tags.

              The text is tagged with ``tag`` (imported from ``pattern.en``).
              Each tag is mapped by its first letter: J -> ``wn.ADJ``,
              V -> ``wn.VERB``, N -> ``wn.NOUN``, R -> ``wn.ADV``; any other
              tag maps to ``None``.

              Returns:
                  A list of ``(lower_cased_word, wordnet_tag_or_None)`` tuples.
              """
              # Tag prefix -> name of the corresponding constant on ``wn``.
              # The constant is looked up only when a prefix actually matches.
              prefix_to_wn_attr = (
                  ('J', 'ADJ'),
                  ('V', 'VERB'),
                  ('N', 'NOUN'),
                  ('R', 'ADV'),
              )

              def penn_to_wn_tags(pos_tag):
                  for prefix, wn_attr in prefix_to_wn_attr:
                      if pos_tag.startswith(prefix):
                          return getattr(wn, wn_attr)
                  return None

              tagged_lower_text = []
              for word, pos_tag in tag(text):
                  tagged_lower_text.append((word.lower(), penn_to_wn_tags(pos_tag)))
              return tagged_lower_text
         
          # lemmatize text based on POS tags    
         
          def lemmatize_text(text):
              """Return ``text`` with each word replaced by its lemma.

              Words are tagged through ``pos_tag_text``. A word with a truthy
              tag is passed to ``wnl.lemmatize`` together with that tag; a word
              without one is kept as returned by ``pos_tag_text``. The results
              are joined with single spaces.
              """
              lemmas = []
              for token, wn_tag in pos_tag_text(text):
                  if wn_tag:
                      lemmas.append(wnl.lemmatize(token, wn_tag))
                  else:
                      lemmas.append(token)
              return ' '.join(lemmas)
         
          def remove_special_characters(text):
              """Strip punctuation characters from every token of ``text``.

              The text is tokenized with ``tokenize_text``; each character
              listed in ``string.punctuation`` is deleted from each token,
              tokens that end up empty are dropped, and the rest are joined
              with single spaces.
              """
              tokens = tokenize_text(text)
              punctuation_re = re.compile(
                  '[{}]'.format(re.escape(string.punctuation)))
              cleaned = (punctuation_re.sub('', token) for token in tokens)
              return ' '.join(piece for piece in cleaned if piece)
         
          def remove_stopwords(text):
              """Return ``text`` without the tokens listed in ``stopword_list``.

              The text is tokenized with ``tokenize_text``; the comparison with
              the stop-word list is an exact (case-sensitive) match. Surviving
              tokens are joined with single spaces.
              """
              kept = []
              for token in tokenize_text(text):
                  if token in stopword_list:
                      continue
                  kept.append(token)
              return ' '.join(kept)
         
          def normalize_corpus(corpus, tokenize=False):
              """Normalize every document in ``corpus``.

              Each document goes through, in order: ``expand_contractions``
              (with ``CONTRACTION_MAP``), ``lemmatize_text``,
              ``remove_special_characters`` and ``remove_stopwords``.

              Args:
                  corpus: Iterable of text documents (strings).
                  tokenize: When true, each normalized document is returned as
                      a token list (via ``tokenize_text``) instead of a string.

              Returns:
                  A list with exactly one entry per input document.
              """
              normalized_corpus = []
              for text in corpus:
                  text = expand_contractions(text, CONTRACTION_MAP)
                  text = lemmatize_text(text)
                  text = remove_special_characters(text)
                  text = remove_stopwords(text)
                  if tokenize:
                      # Store the token list *instead of* the string; the old
                      # code appended both, doubling the length of the result.
                      text = tokenize_text(text)
                  normalized_corpus.append(text)
              return normalized_corpus

4.文本规范化处理

4.文本规范化处理

猜你喜欢