Github: https://github.com/yjfiejd/Text_Classification_NN/blob/master/text_classification_6.28.py
# -*- coding:utf8 -*- # @TIME : 2018/6/28 下午10:34 # @Author : Allen # @File : text_classification_6.28.py import nltk from nltk.stem.lancaster import LancasterStemmer import os import json import datetime stemmer = LancasterStemmer() #3种意图 training_data = [] training_data.append({"class":"greeting", "sentence":"how are you?"}) training_data.append({"class":"greeting", "sentence":"how is your day"}) training_data.append({"class":"greeting", "sentence":"good day"}) training_data.append({"class":"greeting", "sentence":"how is it going today?"}) training_data.append({"class":"goodbye", "sentence":"have a nice day"}) training_data.append({"class":"goodbye", "sentence":"see you later"}) training_data.append({"class":"goodbye", "sentence":"have a nice day"}) training_data.append({"class":"goodbye", "sentence":"talk to you soon"}) training_data.append({"class":"sandwich", "sentence":"make me a sandwich"}) training_data.append({"class":"sandwich", "sentence":"can you make a sandwich"}) training_data.append({"class":"sandwich", "sentence":"having a sandwich today"}) training_data.append({"class":"sandwich", "sentence":"what's for lunch?"}) #预处理 words = [] classes = [] documents = [] ignore_words = ['?'] for pattern in training_data: w = nltk.word_tokenize(pattern['sentence']) words.extend(w) documents.append((w, pattern['class'])) if pattern['class'] not in classes: classes.append(pattern['class']) words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words] words = list(set(words)) classes = list(set(classes)) print (len(documents), "documents") print (len(classes), "classes", classes) print (len(words), "unique stemmed words", words) print("************") print(documents) print(classes) print(words) print("************") #bag of word training = [] output = [] output_empty = [0] * len(classes) for doc in documents: bag = [] pattern_words = doc[0] pattern_words = [stemmer.stem(word.lower()) for word in pattern_words] for w in words: bag.append(1) if w in pattern_words else bag.append(0) training.append(bag) output_row = list(output_empty) output_row[classes.index(doc[1])] = 1 output.append(output_row) i = 0 w = documents[i][0] print(w) print([stemmer.stem(word.lower()) for word in w]) print(training[i]) print(output[i]) print("******下面的代码,实现了词袋处理并将输入句子转换为0、1数组******") #使用sigmiod作为激活函数, 不断的调整参数,直到错误率降低到可以接受 import numpy as np import time def sigmoid(x): output = 1/(1 + np.exp(-x)) return output def sigmoid_output_to_derivative(output): return output*(1-output) def clean_up_sentence(sentence): sentence_words = nltk.word_tokenize(sentence) sentence_words = [stemmer.stem(word.lower()) for word in sentence_words] return sentence_words def bow(sentence, words, show_details=False): sentence_words = clean_up_sentence(sentence) bag = [0] * len(words) for s in sentence_words: for i,w in enumerate(words): if w == s: bag[i] = 1 if show_details: print("found in bag: %s" % w) return (np.array(bag)) def think(sentence, show_details=False): x = bow(sentence.lower(), words, show_details) if show_details: print ("sentence:", sentence, "\n bow:", x) # input layer is our bag of words l0 = x # matrix multiplication of input and hidden layer l1 = sigmoid(np.dot(l0, synapse_0)) # output layer l2 = sigmoid(np.dot(l1, synapse_1)) return l2 print("******实现神经网络的训练函数来调整突触的权重******") def train(X, y, hidden_neurons=10, alpha=1, epochs=50000, dropout=False, dropout_percent=0.5): print ("Training with %s neurons, alpha:%s, dropout:%s %s" % (hidden_neurons, str(alpha), dropout, dropout_percent if dropout else '') ) print ("Input matrix: %sx%s Output matrix: %sx%s" % (len(X),len(X[0]),1, len(classes)) ) np.random.seed(1) last_mean_error = 1 # randomly initialize our weights with mean 0 synapse_0 = 2*np.random.random((len(X[0]), hidden_neurons)) - 1 synapse_1 = 2*np.random.random((hidden_neurons, len(classes))) - 1 prev_synapse_0_weight_update = np.zeros_like(synapse_0) prev_synapse_1_weight_update = np.zeros_like(synapse_1) synapse_0_direction_count = np.zeros_like(synapse_0) synapse_1_direction_count = np.zeros_like(synapse_1) for j in iter(range(epochs+1)): # Feed forward through layers 0, 1, and 2 layer_0 = X layer_1 = sigmoid(np.dot(layer_0, synapse_0)) if(dropout): layer_1 *= np.random.binomial([np.ones((len(X),hidden_neurons))],1-dropout_percent)[0] * (1.0/(1-dropout_percent)) layer_2 = sigmoid(np.dot(layer_1, synapse_1)) # how much did we miss the target value? layer_2_error = y - layer_2 if (j% 10000) == 0 and j > 5000: # if this 10k iteration's error is greater than the last iteration, break out if np.mean(np.abs(layer_2_error)) < last_mean_error: print ("delta after "+str(j)+" iterations:" + str(np.mean(np.abs(layer_2_error))) ) last_mean_error = np.mean(np.abs(layer_2_error)) else: print ("break:", np.mean(np.abs(layer_2_error)), ">", last_mean_error ) break # in what direction is the target value? # were we really sure? if so, don't change too much. layer_2_delta = layer_2_error * sigmoid_output_to_derivative(layer_2) # how much did each l1 value contribute to the l2 error (according to the weights)? layer_1_error = layer_2_delta.dot(synapse_1.T) # in what direction is the target l1? # were we really sure? if so, don't change too much. layer_1_delta = layer_1_error * sigmoid_output_to_derivative(layer_1) synapse_1_weight_update = (layer_1.T.dot(layer_2_delta)) synapse_0_weight_update = (layer_0.T.dot(layer_1_delta)) if(j > 0): synapse_0_direction_count += np.abs(((synapse_0_weight_update > 0)+0) - ((prev_synapse_0_weight_update > 0) + 0)) synapse_1_direction_count += np.abs(((synapse_1_weight_update > 0)+0) - ((prev_synapse_1_weight_update > 0) + 0)) synapse_1 += alpha * synapse_1_weight_update synapse_0 += alpha * synapse_0_weight_update prev_synapse_0_weight_update = synapse_0_weight_update prev_synapse_1_weight_update = synapse_1_weight_update now = datetime.datetime.now() # persist synapses synapse = {'synapse0': synapse_0.tolist(), 'synapse1': synapse_1.tolist(), 'datetime': now.strftime("%Y-%m-%d %H:%M"), 'words': words, 'classes': classes } synapse_file = "synapses.json" with open(synapse_file, 'w') as outfile: json.dump(synapse, outfile, indent=4, sort_keys=True) # print ("saved synapses to:", synapse_file) print("******隐层中只使用了20个神经元,因此比较容易进行调节******") X = np.array(training) y = np.array(output) start_time = time.time() train(X, y, hidden_neurons=20, alpha=0.1, epochs=100000, dropout=False, dropout_percent=0.2) elapsed_time = time.time() - start_time print ("processing time:", elapsed_time, "seconds") print("******预测一个句子属于某个分类的概率******") # probability threshold ERROR_THRESHOLD = 0.2 # load our calculated synapse values synapse_file = 'synapses.json' with open(synapse_file) as data_file: synapse = json.load(data_file) synapse_0 = np.asarray(synapse['synapse0']) synapse_1 = np.asarray(synapse['synapse1']) def classify(sentence, show_details=False): results = think(sentence, show_details) results = [[i,r] for i,r in enumerate(results) if r>ERROR_THRESHOLD ] results.sort(key=lambda x: x[1], reverse=True) return_results =[[classes[r[0]],r[1]] for r in results] print ("%s \n classification: %s" % (sentence, return_results)) return return_results classify("sudo make me a sandwich") classify("how are you today?") classify("talk to you tomorrow") classify("who are you?") classify("make me some lunch") classify("how was your lunch today?") print() classify("good day", show_details=True) #运行的结果 # 12 documents # 3 classes ['greeting', 'sandwich', 'goodbye'] # 26 unique stemmed words ['me', 'can', 'lunch', 'soon', 'good', 'to', 'for', 'see', 'a', 'is', 'what', 'day', "'s", 'going', 'ar', 'lat', 'today', 'it', 'hav', 'you', 'talk', 'yo', 'nic', 'sandwich', 'mak', 'how'] # ************ # [(['how', 'are', 'you', '?'], 'greeting'), (['how', 'is', 'your', 'day'], 'greeting'), (['good', 'day'], 'greeting'), (['how', 'is', 'it', 'going', 'today', '?'], 'greeting'), (['have', 'a', 'nice', 'day'], 'goodbye'), (['see', 'you', 'later'], 'goodbye'), (['have', 'a', 'nice', 'day'], 'goodbye'), (['talk', 'to', 'you', 'soon'], 'goodbye'), (['make', 'me', 'a', 'sandwich'], 'sandwich'), (['can', 'you', 'make', 'a', 'sandwich'], 'sandwich'), (['having', 'a', 'sandwich', 'today'], 'sandwich'), (['what', "'s", 'for', 'lunch', '?'], 'sandwich')] # ['greeting', 'sandwich', 'goodbye'] # ['me', 'can', 'lunch', 'soon', 'good', 'to', 'for', 'see', 'a', 'is', 'what', 'day', "'s", 'going', 'ar', 'lat', 'today', 'it', 'hav', 'you', 'talk', 'yo', 'nic', 'sandwich', 'mak', 'how'] # ************ # ['how', 'are', 'you', '?'] # ['how', 'ar', 'you', '?'] # [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1] # [1, 0, 0] # ******下面的代码,实现了词袋处理并将输入句子转换为0、1数组****** # ******实现神经网络的训练函数来调整突触的权重****** # ******隐层中只使用了20个神经元,因此比较容易进行调节****** # Training with 20 neurons, alpha:0.1, dropout:False # Input matrix: 12x26 Output matrix: 1x3 # delta after 10000 iterations:0.006316297034425907 # delta after 20000 iterations:0.0043193475426743615 # delta after 30000 iterations:0.003467478418933735 # delta after 40000 iterations:0.0029698374148845906 # delta after 50000 iterations:0.0026348137914361477 # delta after 60000 iterations:0.0023899968815955614 # delta after 70000 iterations:0.0022012384383537555 # delta after 80000 iterations:0.0020500796559913266 # delta after 90000 iterations:0.0019255650109882149 # delta after 100000 iterations:0.00182073232414893 # processing time: 8.264868974685669 seconds # ******预测一个句子属于某个分类的概率****** # sudo make me a sandwich # classification: [['sandwich', 0.998725720350513]] # how are you today? # classification: [['greeting', 0.9991704815542843]] # talk to you tomorrow # classification: [['goodbye', 0.9916537770480427]] # who are you? # classification: [['greeting', 0.852891610128995]] # make me some lunch # classification: [['sandwich', 0.9756042223385346]] # how was your lunch today? # classification: [['greeting', 0.9854377884742492]] # # found in bag: good # found in bag: day # sentence: good day # bow: [0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0] # good day # classification: [['greeting', 0.9966108820436639]] # # Process finished with exit code 0