如何确定字符串是英语还是Java代码?

考虑以下两个字符串:
1. for (int i = 0; i < b.size(); i++) {
2. do something in English (not necessarily a complete sentence).

第一个是Java代码,第二个是英语。如何检测第一个是代码,第二个是英语?
这段Java代码可能无法被解析,因为它不一定是完整的方法/语句/表达式。下面给出针对该问题的一种解决方案。代码和英语之间有时并没有清晰的界线,因此准确率无法达到100%。不过,基于下面的方案,您可以很容易地调整程序以满足自己的需求。
基本思想是将字符串转换为一组标记。例如,上面的代码行可能变成“KEY,SEPARATOR,KEY,ID,ASSIGN,NUMBER,SEPARATOR,…”。然后我们可以使用简单的规则将代码与英语区分开。
标记器类将字符串转换为标记列表。
package lexical;
import java.util.LinkedList;import java.util.regex.Matcher;import java.util.regex.Pattern;
/**
 * Minimal regex-driven lexer.
 *
 * <p>Callers register (regex, token code) pairs with {@link #add(String, int)}
 * and then call {@link #tokenize(String)}. Rules are tried in registration
 * order and the first rule that matches at the start of the remaining input
 * wins. Whitespace between lexemes is discarded by trimming.
 */
public class Tokenizer {

	/** A compiled, start-anchored pattern together with the token code it produces. */
	private static final class TokenInfo {
		final Pattern regex;
		final int token;

		TokenInfo(Pattern regex, int token) {
			this.regex = regex;
			this.token = token;
		}
	}

	/** A single recognised lexeme: its token code and the matched (trimmed) text. */
	public class Token {
		public final int token;
		public final String sequence;

		public Token(int token, String sequence) {
			super();
			this.token = token;
			this.sequence = sequence;
		}

	}

	/** Registered rules, in the order they were added. */
	private final LinkedList<TokenInfo> tokenInfos;
	/** Result of the most recent {@link #tokenize(String)} call. */
	private final LinkedList<Token> tokens;

	/** Creates a tokenizer with no rules and no tokens. */
	public Tokenizer() {
		tokenInfos = new LinkedList<TokenInfo>();
		tokens = new LinkedList<Token>();
	}

	/**
	 * Registers a rule.
	 *
	 * @param regex regular expression for the lexeme; it is wrapped as
	 *              {@code ^(regex)} so it only matches at the start of the
	 *              remaining input
	 * @param token integer code emitted for every lexeme matched by this rule
	 */
	public void add(String regex, int token) {
		tokenInfos.add(new TokenInfo(Pattern.compile("^(" + regex + ")"), token));
	}

	/**
	 * Splits {@code str} into tokens, replacing the result of any previous call.
	 *
	 * <p>If some part of the input is matched by no rule, the token list is
	 * emptied, a diagnostic is printed to standard output, and the method
	 * returns. A {@code null} input is treated like an empty string.
	 *
	 * @param str text to tokenize; may be {@code null}
	 */
	public void tokenize(String str) {
		tokens.clear();
		String s = (str == null) ? "" : str.trim();
		while (!s.isEmpty()) {
			boolean match = false;
			for (TokenInfo info : tokenInfos) {
				Matcher m = info.regex.matcher(s);
				// A zero-length match consumes nothing; accepting it would make
				// this loop spin forever, so treat it as "rule did not match".
				if (m.find() && m.end() > m.start()) {
					match = true;
					String tok = m.group().trim();
					s = m.replaceFirst("").trim();
					tokens.add(new Token(info.token, tok));
					break;
				}
			}
			if (!match) {
				// Report the failure by leaving the token list empty.
				tokens.clear();
				System.out.println("Unexpected character in input: " + s);
				return;
			}
		}
	}

	/**
	 * Returns the tokens produced by the last {@link #tokenize(String)} call.
	 *
	 * @return the internal token list (not a copy)
	 */
	public LinkedList<Token> getTokens() {
		return tokens;
	}

	/**
	 * Concatenates the integer codes of the current tokens, in order, with no
	 * separator (for example codes 1, 2, 4 give {@code "124"}).
	 *
	 * @return the token codes as one string; empty if there are no tokens
	 */
	public String getTokensString() {
		StringBuilder sb = new StringBuilder();
		for (Tokenizer.Token tok : tokens) {
			sb.append(tok.token);
		}
		return sb.toString();
	}
}

借助标记器,我们可以识别出Java关键字、分隔符、运算符、标识符等。只要为每一类标记分配一个映射值(数字),就可以把任意输入字符串转换为由这些数字组成的标记字符串。
package lexical;
import greenblocks.javaapiexamples.DB;import java.io.IOException;import java.sql.ResultSet;import java.sql.SQLException;import java.util.regex.Matcher;import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import NLP.POSTagger;
/**
 * Heuristic that decides whether a line of text is English or Java code.
 *
 * <p>The line is tokenized into a string of single-digit token codes
 * (1 = keyword, 2 = separator/operator, 3 = number, 4 = identifier or
 * {@code @}). A line counts as English when that string contains three
 * identifier-like tokens in a row, or consists only of such tokens.
 */
public class EnglishOrCode {

	/** Shared tokenizer; created by {@link #initializeTokenizer()}. */
	private static Tokenizer tokenizer = null;

	/**
	 * Builds the shared tokenizer and registers the lexical rules.
	 * Rules are tried in registration order, so keywords are registered
	 * before identifiers.
	 */
	public static void initializeTokenizer() {
		Tokenizer configured = new Tokenizer();

		// key words
		String keyString = "abstract assert boolean break byte case catch "
				+ "char class const continue default do double else enum"
				+ " extends false final finally float for goto if implements "
				+ "import instanceof int interface long native new null "
				+ "package private protected public return short static "
				+ "strictfp super switch synchronized this throw throws true "
				+ "transient try void volatile while todo";
		String keyStr = keyString.replace(' ', '|');

		// The trailing \b makes a keyword match only as a whole word. Without
		// it "forget" lexes as keyword "for" + identifier "get", and longer
		// keywords such as "interface" lose to their prefix "int".
		configured.add("(?:" + keyStr + ")\\b", 1);

		// separators, operators, etc. Multi-character operators come first:
		// alternation is ordered, so "=" listed before "==" would always win.
		configured.add("==|<=|>=|!=|&&|\\|\\||\\+\\+|--|"
				+ "\\(|\\)|\\{|\\}|\\[|\\]|;|,|\\.|=|>|<|!|~|"
				+ "\\?|:|\\+|-|\\*|/|&|\\||\\^|%|\'|\"|\n|\r|\\$|\\#",
				2);

		configured.add("[0-9]+", 3); // number
		configured.add("[a-zA-Z][a-zA-Z0-9_]*", 4); // identifier
		configured.add("@", 4);

		tokenizer = configured;
	}

	/**
	 * Demo: classifies one English phrase and one Java fragment and prints
	 * {@code English} or {@code Java Code} for each.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) throws SQLException, ClassNotFoundException, IOException {
		initializeTokenizer();

		String s = "do something in English";
		System.out.println(isEnglish(s) ? "English" : "Java Code");

		s = "for (int i = 0; i < b.size(); i++) {";
		System.out.println(isEnglish(s) ? "English" : "Java Code");
	}

	/**
	 * Classifies a line of text.
	 *
	 * <p>When the tokenizer rejects the input its token string is empty, so
	 * the line is reported as not English.
	 *
	 * @param replaced the text to classify
	 * @return {@code true} if the token string contains "444" or consists
	 *         only of 4s; {@code false} otherwise
	 */
	private static boolean isEnglish(String replaced) {
		if (tokenizer == null) {
			// Allow use without an explicit initializeTokenizer() call.
			initializeTokenizer();
		}
		tokenizer.tokenize(replaced);
		String patternString = tokenizer.getTokensString();

		return patternString.contains("444") || patternString.matches("4+");
	}
}

输出:
English
Java Code
最后,开发这么多年我也总结了一套学习Java的资料与面试题,如果你在技术上面想提升自己的话,可以关注我,私信发送领取资料或者在评论区留下自己的联系方式,有时间记得帮我点下转发让更多的人看到哦。在这里插入图片描述

发布了98 篇原创文章 · 获赞 16 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/zhaozihao594/article/details/104278503