https://github.com/lucyTheSlayer/FFJson
手撸个python目标还是太遥远,先从简单的json解析器开始。捣鼓了半天,似乎是ok了,现总结一下。
1.Lexer 分词器
分词器的目标是将输入的String token化,定义了以下Token种类
public enum FFToken {
Comma,Colon,LBracket,RBracket,LBrace,RBrace,Str,Number,EOF;
}
分词器的实现核心便是一个状态机,当一个token被识别时,便将识别出的token返回给上层调用者:parser。
分词器需要一个peek功能,即偷瞄一下下一个token,这里面我采取的是回退机制,即token获取后,parser可以调用lexer的pushBackToken方法,将token退回去;这样下次getToken时,便还是返回这个token。
分词器的代码如下:
public class FFLexer {
private String str;
private int pos_start;
private int pos_end;
private int last_pos_start;
private int last_pos_end;
private State state;
private boolean bEscape;
private boolean dotted;
private String value;
enum State{
Start,
InNumber,
InStr
}
public FFLexer(String str){
this.str = str;
this.pos_start = 0;
this.pos_end = 0;
this.last_pos_start = 0;
this.last_pos_end = 0;
this.state = State.Start;
this.bEscape = false;
this.dotted = false;
}
public FFToken getToken() throws FFParseException{
this.last_pos_start = this.pos_start;
this.last_pos_end = this.pos_end;
FFToken token = null;
while(token == null){
if(this.pos_start == this.str.length()){
return FFToken.EOF;
}
this.pos_end ++;
boolean ended = false;
if(this.pos_end > this.str.length()){
ended = true;
}
char c = '\0';
if(!ended){
c = endChar();
}
switch (this.state){
case Start:
if(c == '{'){
token = FFToken.LBrace;
}else if(c == '}'){
token = FFToken.RBrace;
}else if(c == '['){
token = FFToken.LBracket;
}else if(c == ']'){
token = FFToken.RBracket;
}else if(c == ','){
token = FFToken.Comma;
}else if(c == ':'){
token = FFToken.Colon;
}else if(c == '"'){
state = State.InStr;
this.pos_start += 1;
}else if(Character.isDigit(c)){
state = State.InNumber;
}else if(c == ' '|| c=='\t'||c=='\n'||c=='\r'){
this.pos_start += 1;
}else{
throw new FFParseException();
}
break;
case InStr:
if(ended){
throw new FFParseException();
}
if(c == '"'){
if(!bEscape){
value = this.str.substring(this.pos_start,this.pos_end - 1);
token = FFToken.Str;
this.state = State.Start;
}else{
this.bEscape = false;
}
}else{
if(c == '\\'){
this.bEscape = true;
}
}
break;
case InNumber:
if(ended){
throw new FFParseException();
}
if(Character.isDigit(c)){
}else if(endChar() == '.'){
if(this.dotted){
value = this.str.substring(this.pos_start,this.pos_end - 1);
token = FFToken.Number;
this.dotted = false;
this.state = State.Start;
this.pos_end --;
}else{
this.dotted = true;
}
}else{
value = this.str.substring(this.pos_start,this.pos_end - 1);
token = FFToken.Number;
this.dotted = false;
this.state = State.Start;
this.pos_end --;
}
break;
}
}
this.pos_start = this.pos_end;
return token;
}
public void pushBackToken(){
this.pos_start = this.last_pos_start;
this.pos_end = this.last_pos_end;
}
public String processedStr(){
return this.str.substring(0,this.pos_end );
}
private char endChar(){
return this.str.charAt(this.pos_end - 1);
}
public String getValue(){
return this.value;
}
}
可见dirty work是挺多的,调了我好久才总算看上去灵光。
2.解析器 parser
parser的实现核心便是函数递归调用,如要解析一个JsonObject,那么
我们首先需要获得一个{,
1)如果下一个token不是},
如果不是第一次匹配,需匹配一个逗号,
匹配一个key,再匹配一个:,再匹配一个value
回到1)处继续判断
否则,匹配一个}
JsonArray的匹配思路也是一样的。
当然,光解析不够,我们还得将转换后的结果保存、返回,这很类似编译器里面的代码生成。
public class FFParser {
private FFLexer ffLexer;
public FFParser(String str){
ffLexer = new FFLexer(str);
}
public FFJsonObject parseJsonObject() throws FFParseException{
return jsonObject();
}
public FFJsonArray parseJsonArray() throws FFParseException{
return jsonArray();
}
private FFJsonObject jsonObject() throws FFParseException{
FFJsonObject ffJsonObject = new FFJsonObject();
match(FFToken.LBrace);
FFToken token;
token = ffLexer.getToken();
ffLexer.pushBackToken();
if(token != FFToken.RBrace){
Object key = key();
match(FFToken.Colon);
Object value = value();
ffJsonObject.put(key,value);
}
do{
token = ffLexer.getToken();
if(token == FFToken.Comma){
token = ffLexer.getToken();
ffLexer.pushBackToken();
if(token != FFToken.RBrace ){
Object key = key();
match(FFToken.Colon);
Object value = value();
ffJsonObject.put(key,value);
token = ffLexer.getToken();
ffLexer.pushBackToken();
}
}else{
ffLexer.pushBackToken();
}
}while (token != FFToken.RBrace && token != FFToken.EOF);
match(FFToken.RBrace);
return ffJsonObject;
}
private FFJsonArray jsonArray() throws FFParseException{
FFJsonArray ffJsonArray = new FFJsonArray();
match(FFToken.LBracket);
FFToken token;
token = ffLexer.getToken();
ffLexer.pushBackToken();
if(token != FFToken.RBracket){
Object value = value();
ffJsonArray.add(value);
}
do{
token = ffLexer.getToken();
if(token == FFToken.Comma){
token = ffLexer.getToken();
ffLexer.pushBackToken();
if(token != FFToken.RBracket ) {
Object value = value();
ffJsonArray.add(value);
token = ffLexer.getToken();
ffLexer.pushBackToken();
}
}else{
ffLexer.pushBackToken();
}
}while (token != FFToken.RBracket && token != FFToken.EOF);
match(FFToken.RBracket);
return ffJsonArray;
}
private Object key() throws FFParseException{
FFToken token;
token = ffLexer.getToken();
if(token == FFToken.Str){
return ffLexer.getValue();
}else if(token == FFToken.Number){
return parseNumber(ffLexer.getValue());
}else{
System.err.println(token);
throw new FFParseException();
}
}
private Object value() throws FFParseException{
FFToken token;
token = ffLexer.getToken();
if(token == FFToken.Str){
return ffLexer.getValue();
}else if(token == FFToken.Number){
return parseNumber(ffLexer.getValue());
}else if(token == FFToken.LBrace){
ffLexer.pushBackToken();
return jsonObject();
}else if(token == FFToken.LBracket){
ffLexer.pushBackToken();;
return jsonArray();
}else{
throw new FFParseException();
}
}
private void match(FFToken expectedToken) throws FFParseException{
FFToken token = ffLexer.getToken();
if(token != expectedToken){
System.err.println(ffLexer.processedStr());
System.err.println(token);
System.err.println(expectedToken);
throw new FFParseException();
}
}
private Object parseNumber(String value){
if(value.contains(".")){
return Double.parseDouble(value);
}else{
return Long.parseLong(value);
}
}
}
最后的工作便是搞个门面类FFJson供外界调用了。代码就不贴在这儿了。
下面是测试结果:
public class App
{
public static void main( String[] args )
{
System.out.println(FFJson.parseObject("{\"abc\":123,\"dddd\":[1,3,3],\"xxx\":{\"a\":1,\"b\":\"a\"}}"));
System.out.println(FFJson.parseArray("[1,2,3,4,5]"));
String str = "{\n" +
" \"code\": 0,\n" +
" \"msg\": \"成功\",\n" +
" \"data\": [\n" +
" {\n" +
" \"name\": \"热榜\",\n" +
" \"type\": 1,\n" +
" \"foods\": [\n" +
" {\n" +
" \"id\": \"123456\",\n" +
" \"name\": \"皮蛋粥\",\n" +
" \"price\": 1.2,\n" +
" \"description\": \"好吃的皮蛋粥\",\n" +
" \"icon\": \"http://xxx.com\",\n" +
" }\n" +
" ]\n" +
" },\n" +
" {\n" +
" \"name\": \"好吃的\",\n" +
" \"type\": 2,\n" +
" \"foods\": [\n" +
" {\n" +
" \"id\": \"123457\",\n" +
" \"name\": \"慕斯蛋糕\",\n" +
" \"price\": 10.9,\n" +
" \"description\": \"美味爽口\",\n" +
" \"icon\": \"http://xxx.com\",\n" +
" }\n" +
" ]\n" +
" }\n" +
" ]\n" +
"}";
FFJsonObject ffJsonObject = FFJson.parseObject(str);
System.out.println(ffJsonObject);
System.out.println(ffJsonObject.getJsonArray("data").getJsonObject(0).getJsonArray("foods").getJsonObject(0).getString("description"));
}
}
其输出为:
{"abc":123,"xxx":{"a":1,"b":"a"},"dddd":[1,3,3]}
[1,2,3,4,5]
{"msg":"成功","code":0,"data":[{"foods":[{"price":1.2,"name":"皮蛋粥","icon":"http://xxx.com","description":"好吃的皮蛋粥","id":"123456"}],"name":"热榜","type":1},{"foods":[{"price":10.9,"name":"慕斯蛋糕","icon":"http://xxx.com","description":"美味爽口","id":"123457"}],"name":"好吃的","type":2}]}
好吃的皮蛋粥
okay,尤其那一长串的链式调用非常的有feel。
题外话:抛去隐藏bug、性能问题不谈,功能性上,一个完整的json类库还缺少与实体类的转换,不过这个不属于编译原理的范畴了,也就不造轮子了,或者等下次有闲情继续完善。