diff --git a/lib/linguist/languages.yml b/lib/linguist/languages.yml index 01606b9e..46ee1333 100644 --- a/lib/linguist/languages.yml +++ b/lib/linguist/languages.yml @@ -1434,6 +1434,14 @@ J: tm_scope: source.j ace_mode: text +JFlex: + type: programming + color: "#EBCA30" + extensions: + - .flex + - .jflex + ace_mode: text + JSON: type: data tm_scope: source.json diff --git a/samples/JFlex/LexScan.flex b/samples/JFlex/LexScan.flex new file mode 100644 index 00000000..f8d877f1 --- /dev/null +++ b/samples/JFlex/LexScan.flex @@ -0,0 +1,742 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * JFlex 1.7.0-SNAPSHOT * + * Copyright (C) 1998-2015 Gerwin Klein * + * All rights reserved. * + * * + * License: BSD * + * * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +package jflex; + +import java_cup.runtime.Symbol; +import java.io.*; +import java.util.Stack; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.HashMap; +import jflex.unicode.UnicodeProperties; + +%% + +%final +%public +%class LexScan +%implements sym, java_cup.runtime.Scanner +%function next_token + +%type Symbol +%unicode + +%column +%line + +%eofclose + +%state COMMENT, STATELIST, MACROS, REGEXPSTART +%state REGEXP, JAVA_CODE, STATES, STRING_CONTENT +%state CHARCLASS, COPY, REPEATEXP, EATWSPNL +%state CTOR_ARG, REGEXP_CODEPOINT_SEQUENCE +%state STRING_CODEPOINT_SEQUENCE, CHARCLASS_CODEPOINT + +%inputstreamctor false + +%cupdebug + +%{ + int balance = 0; + int commentbalance = 0; + int action_line = 0; + int bufferSize = 16384; + + File file; + Stack files = new Stack(); + + StringBuilder userCode = new StringBuilder(); + + String classCode; + String initCode; + String initThrow; + String eofCode; + String eofThrow; + String lexThrow; + String eofVal; + String scanErrorException; + String cupSymbol = "sym"; + + StringBuilder actionText = new StringBuilder(); + StringBuilder string = new StringBuilder(); + + private UnicodeProperties unicodeProperties; + + boolean charCount; + boolean lineCount; + boolean columnCount; + boolean cupCompatible; + boolean cup2Compatible; + boolean cupDebug; + boolean isInteger; + boolean isIntWrap; + boolean isYYEOF; + boolean notUnix; + boolean isPublic; + boolean isFinal; + boolean isAbstract; + boolean bolUsed; + boolean standalone; + boolean debugOption; + boolean caseless; + boolean inclusive_states; + boolean eofclose; + boolean isASCII; + // TODO: In the version of JFlex after 1.6, the InputStream ctor + // TODO: will never be emitted, and this option will cease to exist. + boolean emitInputStreamCtor = Options.emitInputStreamCtor; + + String isImplementing; + String isExtending; + String className = "Yylex"; + String functionName; + String tokenType; + String visibility = "public"; + + List ctorArgs = new ArrayList(); + List ctorTypes = new ArrayList(); + + LexicalStates states = new LexicalStates(); + + List actions = new ArrayList(); + + private int nextState; + + boolean macroDefinition; + + Timer t = new Timer(); + + // CharClasses.init() is delayed until UnicodeProperties.init() has been called, + // since the max char code won't be known until then. + private CharClasses charClasses = new CharClasses(); + + public CharClasses getCharClasses() { + return charClasses; + } + + public int currentLine() { + return yyline; + } + + public void setFile(File file) { + this.file = file; + } + + private Symbol symbol(int type, Object value) { + return new Symbol(type, yyline, yycolumn, value); + } + + private Symbol symbol(int type) { + return new Symbol(type, yyline, yycolumn); + } + + // updates line and column count to the beginning of the first + // non whitespace character in yytext, but leaves yyline+yycolumn + // untouched + private Symbol symbol_countUpdate(int type, Object value) { + int lc = yyline; + int cc = yycolumn; + String text = yytext(); + + for (int i=0; i < text.length(); i++) { + char c = text.charAt(i); + + if (c != '\n' && c != '\r' && c != ' ' && c != '\t' ) + return new Symbol(type, lc, cc, value); + + if (c == '\n') { + lc++; + cc = 0; + } + else + cc++; + } + + return new Symbol(type, yyline, yycolumn, value); + } + + private String makeMacroIdent() { + String matched = yytext().trim(); + return matched.substring(1, matched.length()-1).trim(); + } + + public static String conc(Object a, Object b) { + if (a == null && b == null) return null; + if (a == null) return b.toString(); + if (b == null) return a.toString(); + + return a.toString()+b.toString(); + } + + public static String concExc(Object a, Object b) { + if (a == null && b == null) return null; + if (a == null) return b.toString(); + if (b == null) return a.toString(); + + return a.toString()+", "+b.toString(); + } + + public UnicodeProperties getUnicodeProperties() { + return unicodeProperties; + } + + private void populateDefaultVersionUnicodeProperties() { + try { + unicodeProperties = new UnicodeProperties(); + } catch (UnicodeProperties.UnsupportedUnicodeVersionException e) { + throw new ScannerException + (file, ErrorMessages.UNSUPPORTED_UNICODE_VERSION, yyline); + } + charClasses.init + (Options.jlex ? 127 : unicodeProperties.getMaximumCodePoint(), this); + } + + private void includeFile(String filePath) { + File f = new File(file.getParentFile(), filePath); + if ( !f.canRead() ) + throw new ScannerException(file,ErrorMessages.NOT_READABLE, yyline); + // check for cycle + if (files.search(f) > 0) + throw new ScannerException(file,ErrorMessages.FILE_CYCLE, yyline); + try { + yypushStream( new FileReader(f) ); + files.push(file); + file = f; + Out.println("Including \""+file+"\""); + } + catch (FileNotFoundException e) { + throw new ScannerException(file,ErrorMessages.NOT_READABLE, yyline); + } + } +%} + +%init{ + states.insert("YYINITIAL", true); +%init} + + +Digit = [0-9] +HexDigit = [0-9a-fA-F] +OctDigit = [0-7] + +Number = {Digit}+ +HexNumber = \\ x {HexDigit} {2} +OctNumber = \\ [0-3]? {OctDigit} {1, 2} + +// Unicode4 can encode chars only in the BMP with the 16 bits provided by its +// 4 hex digits. +Unicode4 = \\ u {HexDigit} {4} + +// Unicode6 can encode all Unicode chars, both in the BMP and in the +// supplementary planes -- only 21 bits are required as of Unicode 5.0, +// but its six hex digits provide 24 bits. +Unicode6 = \\ U {HexDigit} {6} + +// see http://www.unicode.org/unicode/reports/tr18/ +WSP = [ \t\b] +WSPNL = [\u2028\u2029\u000A\u000B\u000C\u000D\u0085\t\b\ ] +NWSPNL = [^\u2028\u2029\u000A\u000B\u000C\u000D\u0085\t\b\ ] +NL = [\u2028\u2029\u000A\u000B\u000C\u000D\u0085] | \u000D\u000A +NNL = [^\u2028\u2029\u000A\u000B\u000C\u000D\u0085] + +Ident = {IdentStart} {IdentPart}* +QualIdent = {Ident} ( {WSP}* "." {WSP}* {Ident} )* +QUIL = {QualIdent} ( {WSP}* "," {WSP}* {QualIdent} )* +Array = "[" {WSP}* "]" +ParamPart = {IdentStart}|{IdentPart}|"<"|">"|","|{WSP}|"&"|"?"|"." +GenParam = "<" {ParamPart}+ ">" +ClassT = {Ident} ({WSP}* {GenParam})? +QClassT = {QualIdent} ({WSP}* {GenParam})? +ArrType = ({GenParam} {WSP}*)? {QClassT} ({WSP}* {Array})* + +IdentStart = [:jletter:] +IdentPart = [:jletterdigit:] + +JFlexCommentChar = [^*/]|"/"+[^*/]|"*"+[^*/] +JFlexComment = {JFlexCommentChar}+ + +/* Java comments */ +JavaComment = {TraditionalComment}|{EndOfLineComment} +TraditionalComment = "/*"{CommentContent}\*+"/" +EndOfLineComment = "//".*{NL} + +CommentContent = ([^*]|\*+[^*/])* + +StringCharacter = [^\u2028\u2029\u000A\u000B\u000C\u000D\u0085\"\\] + +CharLiteral = \'([^\u2028\u2029\u000A\u000B\u000C\u000D\u0085\'\\]|{EscapeSequence})\' +StringLiteral = \"({StringCharacter}|{EscapeSequence})*\" + +EscapeSequence = \\[^\u2028\u2029\u000A\u000B\u000C\u000D\u0085]|\\+u{HexDigit}{4}|\\[0-3]?{OctDigit}{1,2} + +/* \\(b|t|n|f|r|\"|\'|\\|[0-3]?{OctDigit}{1,2}|u{HexDigit}{4}) */ + +JavaRest = [^\{\}\"\'/]|"/"[^*/] +JavaCode = ({JavaRest}|{StringLiteral}|{CharLiteral}|{JavaComment})+ + +DottedVersion = [1-9][0-9]*(\.[0-9]+){0,2} + +%% + + { + "%%".*{NL}? { + t.start(); + yybegin(MACROS); + macroDefinition = true; + return symbol(USERCODE,userCode); + } + .*{NL} | .+ { userCode.append(yytext()); } + <> { return symbol(EOF); } +} + + ("%{"|"%init{"|"%initthrow{"|"%eof{"|"%eofthrow{"|"%yylexthrow{"|"%eofval{").*{NL} + { string.setLength(0); yybegin(COPY); } + { + "%}".*{NL} { classCode = conc(classCode,string); yybegin(MACROS); } + "%init}".*{NL} { initCode = conc(initCode,string); yybegin(MACROS); } + "%initthrow}".*{NL} { initThrow = concExc(initThrow,string); yybegin(MACROS); } + "%eof}".*{NL} { eofCode = conc(eofCode,string); yybegin(MACROS); } + "%eofthrow}".*{NL} { eofThrow = concExc(eofThrow,string); yybegin(MACROS); } + "%yylexthrow}".*{NL} { lexThrow = concExc(lexThrow,string); yybegin(MACROS); } + "%eofval}".*{NL} { eofVal = string.toString(); yybegin(MACROS); } + + .*{NL} { string.append(yytext()); } + + <> { throw new ScannerException(file,ErrorMessages.EOF_IN_MACROS); } +} + + + ^"%s" ("tate" "s"?)? {WSP}+ { inclusive_states = true; yybegin(STATELIST); } + ^"%x" ("state" "s"?)? {WSP}+ { inclusive_states = false; yybegin(STATELIST); } + { + {Ident} { states.insert(yytext(),inclusive_states); } + ([\ \t]*","[\ \t]*)|([\ \t]+) { } + {NL} { yybegin(MACROS); } + <> { throw new ScannerException(file,ErrorMessages.EOF_IN_MACROS); } +} + + { + "%char" { charCount = true; } + "%line" { lineCount = true; } + "%column" { columnCount = true; } + "%byaccj" { isInteger = true; + if (eofVal == null) + eofVal = "return 0;"; + eofclose = true; + } + "%cup2" { cup2Compatible = true; + isImplementing = concExc(isImplementing, "Scanner"); + lineCount = true; + columnCount = true; + if (functionName == null) + functionName = "readNextTerminal"; + if (tokenType == null) + tokenType = "ScannerToken"; + if (eofVal == null) + eofVal = "return token(SpecialTerminals.EndOfInputStream);"; + if (!Options.jlex) eofclose = true; + return symbol(UNICODE); // %unicode + } + "%cup" { cupCompatible = true; + isImplementing = concExc(isImplementing, "java_cup.runtime.Scanner"); + if (functionName == null) + functionName = "next_token"; + if (tokenType == null) + tokenType = "java_cup.runtime.Symbol"; + if (eofVal == null) + eofVal = "return new java_cup.runtime.Symbol("+cupSymbol+".EOF);"; + if (!Options.jlex) eofclose = true; + } + "%cupsym"{WSP}+{QualIdent} {WSP}* { cupSymbol = yytext().substring(8).trim(); + if (cupCompatible) Out.warning(ErrorMessages.CUPSYM_AFTER_CUP, yyline); } + "%cupsym"{WSP}+{NNL}* { throw new ScannerException(file,ErrorMessages.QUIL_CUPSYM, yyline); } + "%cupdebug" { cupDebug = true; } + "%eofclose"({WSP}+"true")? { eofclose = true; } + "%eofclose"({WSP}+"false") { eofclose = false; } + "%class"{WSP}+{ClassT} {WSP}* { className = yytext().substring(7).trim(); } + "%ctorarg"{WSP}+{ArrType}{WSP}+ { yybegin(CTOR_ARG); ctorTypes.add(yytext().substring(8).trim()); } + "%function"{WSP}+{Ident} {WSP}* { functionName = yytext().substring(10).trim(); } + "%type"{WSP}+{ArrType} {WSP}* { tokenType = yytext().substring(6).trim(); } + "%integer"|"%int" { isInteger = true; } + "%intwrap" { isIntWrap = true; } + "%yyeof" { isYYEOF = true; } + "%notunix" { notUnix = true; } + "%7bit" { isASCII = true; return symbol(ASCII); } + "%full"|"%8bit" { return symbol(FULL); } + "%16bit" { populateDefaultVersionUnicodeProperties(); + return symbol(UNICODE); + } + "%unicode"({WSP}+{DottedVersion})? { String v = yytext().substring(8).trim(); + if (v.length() == 0) { + populateDefaultVersionUnicodeProperties(); + } else { + try { + unicodeProperties = new UnicodeProperties(v); + } catch (UnicodeProperties.UnsupportedUnicodeVersionException e) { + throw new ScannerException + (file, ErrorMessages.UNSUPPORTED_UNICODE_VERSION, yyline); + } + charClasses.init + (Options.jlex ? 127 : unicodeProperties.getMaximumCodePoint(), this); + } + return symbol(UNICODE); + } + + "%caseless"|"%ignorecase" { caseless = true; } + "%implements"{WSP}+.* { isImplementing = concExc(isImplementing, yytext().substring(12).trim()); } + "%extends"{WSP}+{QClassT}{WSP}* { isExtending = yytext().substring(9).trim(); } + "%public" { isPublic = true; } + "%apiprivate" { visibility = "private"; Skeleton.makePrivate(); } + "%final" { isFinal = true; } + "%abstract" { isAbstract = true; } + "%debug" { debugOption = true; } + "%standalone" { standalone = true; isInteger = true; } + "%pack" { /* no-op - this is the only generation method */ } + "%include" {WSP}+ .* { includeFile(yytext().substring(9).trim()); } + "%buffer" {WSP}+ {Number} {WSP}* { bufferSize = Integer.parseInt(yytext().substring(8).trim()); } + "%buffer" {WSP}+ {NNL}* { throw new ScannerException(file,ErrorMessages.NO_BUFFER_SIZE, yyline); } + "%initthrow" {WSP}+ {QUIL} {WSP}* { initThrow = concExc(initThrow,yytext().substring(11).trim()); } + "%initthrow" {WSP}+ {NNL}* { throw new ScannerException(file,ErrorMessages.QUIL_INITTHROW, yyline); } + "%eofthrow" {WSP}+ {QUIL} {WSP}* { eofThrow = concExc(eofThrow,yytext().substring(10).trim()); } + "%eofthrow" {WSP}+ {NNL}* { throw new ScannerException(file,ErrorMessages.QUIL_EOFTHROW, yyline); } + "%yylexthrow"{WSP}+ {QUIL} {WSP}* { lexThrow = concExc(lexThrow,yytext().substring(12).trim()); } + "%throws" {WSP}+ {QUIL} {WSP}* { lexThrow = concExc(lexThrow,yytext().substring(8).trim()); } + "%yylexthrow"{WSP}+ {NNL}* { throw new ScannerException(file,ErrorMessages.QUIL_YYLEXTHROW, yyline); } + "%throws" {WSP}+ {NNL}* { throw new ScannerException(file,ErrorMessages.QUIL_THROW, yyline); } + "%scanerror" {WSP}+ {QualIdent} {WSP}* { scanErrorException = yytext().substring(11).trim(); } + "%scanerror" {WSP}+ {NNL}* { throw new ScannerException(file,ErrorMessages.QUIL_SCANERROR, yyline); } +// TODO: In the version of JFlex after 1.6, the %inputstreamctor directive will become a no-op: the InputStream ctor will never be emitted. + "%inputstreamctor"({WSP}+"true")? { emitInputStreamCtor = true; } + "%inputstreamctor"{WSP}+"false" { emitInputStreamCtor = false; } + + {Ident} { return symbol(IDENT, yytext()); } + "="{WSP}* { if (null == unicodeProperties && ! isASCII) { + populateDefaultVersionUnicodeProperties(); + } + yybegin(REGEXP); + return symbol(EQUALS); + } + + "/*" { nextState = MACROS; yybegin(COMMENT); } + + {EndOfLineComment} { } + + ^"%%" {NNL}* { if (null == unicodeProperties && ! isASCII) { + populateDefaultVersionUnicodeProperties(); + } + macroDefinition = false; + yybegin(REGEXPSTART); + return symbol(DELIMITER); + } + "%"{Ident} { throw new ScannerException(file,ErrorMessages.UNKNOWN_OPTION, yyline, yycolumn); } + "%" { throw new ScannerException(file,ErrorMessages.UNKNOWN_OPTION, yyline, yycolumn); } + ^{WSP}+"%" { Out.warning(ErrorMessages.NOT_AT_BOL, yyline); yypushback(1); } + + {WSP}+ { } + {NL}+ { } + <> { if ( yymoreStreams() ) { + file = (File) files.pop(); + yypopStream(); + } + else + throw new ScannerException(file,ErrorMessages.EOF_IN_MACROS); + } +} + + { + {Ident} {WSP}* { yybegin(MACROS); ctorArgs.add(yytext().trim()); } + [^] { throw new ScannerException(file,ErrorMessages.CTOR_ARG,yyline,yycolumn); } +} + + { + ^ {WSP}* "%include" {WSP}+ .* { includeFile(yytext().trim().substring(9).trim()); } + {WSP}* "/*" { nextState = REGEXPSTART; yybegin(COMMENT); } + {WSP}* "<" { yybegin(STATES); return symbol_countUpdate(LESSTHAN, null); } + {WSP}* "}" { return symbol_countUpdate(RBRACE, null); } + {WSP}* "//" {NNL}* { } + {WSP}* "<>" {WSPNL}* "{" { actionText.setLength(0); yybegin(JAVA_CODE); + Symbol s = symbol_countUpdate(EOFRULE, null); + action_line = s.left+1; + return s; + } + ^ {WSP}* {NWSPNL} { yypushback(yylength()); yybegin(REGEXP); } + {WSP} | {NL} { } +} + + { + {Ident} { return symbol(IDENT, yytext()); } + "," { return symbol(COMMA); } + {WSPNL}+ { } + + // "{" will be caught in REGEXP + ">"{WSPNL}* { yybegin(REGEXP); return symbol(MORETHAN); } + + <> { throw new ScannerException(file,ErrorMessages.EOF_IN_STATES); } +} + + + { + "<>" {WSPNL}+ "{" { actionText.setLength(0); yybegin(JAVA_CODE); action_line = yyline+1; return symbol(EOFRULE); } + "<>" { throw new ScannerException(file,ErrorMessages.EOF_WO_ACTION); } + + {WSPNL}*"|"{WSP}*$ { if (macroDefinition) { + yybegin(EATWSPNL); + return symbol(BAR); + } + else { + yybegin(REGEXPSTART); + return symbol(NOACTION); + } + } + + // stategroup + "{" { yybegin(REGEXPSTART); return symbol(LBRACE); } + + {WSPNL}*"|" { return symbol(BAR); } + + {WSPNL}*\" { string.setLength(0); nextState = REGEXP; yybegin(STRING_CONTENT); } + {WSPNL}*"\\u{" { string.setLength(0); yybegin(REGEXP_CODEPOINT_SEQUENCE); } + {WSPNL}*"!" { return symbol(BANG); } + {WSPNL}*"~" { return symbol(TILDE); } + {WSPNL}*"(" { return symbol(OPENBRACKET); } + {WSPNL}*")" { return symbol(CLOSEBRACKET); } + {WSPNL}*"*" { return symbol(STAR); } + {WSPNL}*"+" { return symbol(PLUS); } + {WSPNL}*"?" { return symbol(QUESTION); } + {WSPNL}*"$" { return symbol(DOLLAR); } + {WSPNL}*"^" { bolUsed = true; return symbol(HAT); } + {WSPNL}*"." { return symbol(POINT); } + {WSPNL}*"\\R" { return symbol(NEWLINE); } + {WSPNL}*"[" { yybegin(CHARCLASS); return symbol(OPENCLASS); } + {WSPNL}*"/" { return symbol(LOOKAHEAD); } + + {WSPNL}* "{" {WSP}* {Ident} {WSP}* "}" { return symbol_countUpdate(MACROUSE, makeMacroIdent()); } + {WSPNL}* "{" {WSP}* {Number} { yybegin(REPEATEXP); + return symbol(REPEAT, + new Integer(yytext().trim().substring(1).trim())); + } + + {WSPNL}+ "{" { actionText.setLength(0); yybegin(JAVA_CODE); action_line = yyline+1; return symbol(REGEXPEND); } + {NL} { if (macroDefinition) { yybegin(MACROS); } return symbol(REGEXPEND); } + + {WSPNL}*"/*" { nextState = REGEXP; yybegin(COMMENT); } + + {WSPNL}*"//"{NNL}* { } + + {WSP}+ { } + + { + {WSPNL}*"[:jletter:]" { return symbol(JLETTERCLASS); } + {WSPNL}*"[:jletterdigit:]" { return symbol(JLETTERDIGITCLASS); } + {WSPNL}*"[:letter:]" { return symbol(LETTERCLASS); } + {WSPNL}*"[:uppercase:]" { return symbol(UPPERCLASS); } + {WSPNL}*"[:lowercase:]" { return symbol(LOWERCLASS); } + {WSPNL}*"[:digit:]" { return symbol(DIGITCLASS); } + {WSPNL}*"\\d" { return symbol(DIGITCLASS); } + {WSPNL}*"\\D" { return symbol(DIGITCLASSNOT); } + {WSPNL}*"\\s" { return symbol(WHITESPACECLASS); } + {WSPNL}*"\\S" { return symbol(WHITESPACECLASSNOT); } + {WSPNL}*"\\w" { return symbol(WORDCLASS); } + {WSPNL}*"\\W" { return symbol(WORDCLASSNOT); } + {WSPNL}*"\\p{"[^}]*"}" { String trimmedText = yytext().trim(); + String propertyValue = trimmedText.substring(3,trimmedText.length()-1); + IntCharSet set = unicodeProperties.getIntCharSet(propertyValue); + if (null == set) { + throw new ScannerException(file,ErrorMessages.INVALID_UNICODE_PROPERTY, yyline, yycolumn + 3); + } + return symbol(UNIPROPCCLASS, set); + } + {WSPNL}*"\\P{"[^}]*"}" { String trimmedText = yytext().trim(); + String propertyValue = trimmedText.substring(3,trimmedText.length()-1); + IntCharSet set = unicodeProperties.getIntCharSet(propertyValue); + if (null == set) { + throw new ScannerException(file,ErrorMessages.INVALID_UNICODE_PROPERTY, yyline, yycolumn + 3); + } + return symbol(UNIPROPCCLASSNOT, set); + } + } + + . { return symbol(CHAR, yytext().codePointAt(0)); } +} + + {WSPNL}+ { yybegin(REGEXP); } + + + { + "}" { yybegin(REGEXP); return symbol(RBRACE); } + "," {WSP}* {Number} { return symbol(REPEAT, new Integer(yytext().substring(1).trim())); } + {WSP}+ { } + + <> { throw new ScannerException(file,ErrorMessages.EOF_IN_REGEXP); } +} + + { + "{"{Ident}"}" { return symbol(MACROUSE, yytext().substring(1,yylength()-1)); } + "[" { balance++; return symbol(OPENCLASS); } + "]" { if (balance > 0) balance--; else yybegin(REGEXP); return symbol(CLOSECLASS); } + "^" { return symbol(HAT); } + "-" { return symbol(DASH); } + "--" { return symbol(DIFFERENCE); } + "&&" { return symbol(INTERSECTION); } + "||" { /* union is the default operation - '||' can be ignored */ } + "~~" { return symbol(SYMMETRICDIFFERENCE); } + "\\u{" { yybegin(CHARCLASS_CODEPOINT); } + + // this is a hack to keep JLex compatibilty with char class + // expressions like [+-] + "-]" { yypushback(1); yycolumn--; return symbol(CHAR, (int)'-'); } + + \" { string.setLength(0); nextState = CHARCLASS; yybegin(STRING_CONTENT); } + + . { return symbol(CHAR, yytext().codePointAt(0)); } + + \n { throw new ScannerException(file,ErrorMessages.EOL_IN_CHARCLASS,yyline,yycolumn); } + + <> { throw new ScannerException(file,ErrorMessages.EOF_IN_REGEXP); } +} + + { + \" { yybegin(nextState); return symbol(STRING, string.toString()); } + \\\" { string.append('\"'); } + [^\"\\\u2028\u2029\u000A\u000B\u000C\u000D\u0085]+ { string.append(yytext()); } + + {NL} { throw new ScannerException(file,ErrorMessages.UNTERMINATED_STR, yyline, yycolumn); } + + {HexNumber} { string.append( (char) Integer.parseInt(yytext().substring(2,yylength()), 16)); } + {OctNumber} { string.append( (char) Integer.parseInt(yytext().substring(1,yylength()), 8)); } + {Unicode4} { string.append( (char) Integer.parseInt(yytext().substring(2,yylength()), 16)); } + {Unicode6} { int codePoint = Integer.parseInt(yytext().substring(2,yylength()), 16); + if (codePoint <= unicodeProperties.getMaximumCodePoint()) { + string.append(Character.toChars(codePoint)); + } else { + throw new ScannerException(file,ErrorMessages.CODEPOINT_OUT_OF_RANGE, yyline, yycolumn+2); + } + } + + "\\u{" { yybegin(STRING_CODEPOINT_SEQUENCE); } + + \\b { string.append('\b'); } + \\n { string.append('\n'); } + \\t { string.append('\t'); } + \\f { string.append('\f'); } + \\r { string.append('\r'); } + + \\. { string.append(yytext().substring(1, yytext().offsetByCodePoints(1, 1))); } + + <> { throw new ScannerException(file,ErrorMessages.EOF_IN_STRING); } +} + + + { + {HexNumber} { return symbol(CHAR, Integer.parseInt(yytext().substring(2,yylength()), 16)); } + {OctNumber} { return symbol(CHAR, Integer.parseInt(yytext().substring(1,yylength()), 8)); } + {Unicode4} { return symbol(CHAR, Integer.parseInt(yytext().substring(2,yylength()), 16)); } + {Unicode6} { int codePoint = Integer.parseInt(yytext().substring(2,yylength()), 16); + if (codePoint <= unicodeProperties.getMaximumCodePoint()) { + return symbol(CHAR, codePoint); + } else { + throw new ScannerException(file,ErrorMessages.CODEPOINT_OUT_OF_RANGE, yyline, yycolumn+2); + } + } + + \\b { return symbol(CHAR, (int)'\b'); } + \\n { return symbol(CHAR, (int)'\n'); } + \\t { return symbol(CHAR, (int)'\t'); } + \\f { return symbol(CHAR, (int)'\f'); } + \\r { return symbol(CHAR, (int)'\r'); } + + \\. { return symbol(CHAR, yytext().codePointAt(1)); } +} + + + { + "{" { balance++; actionText.append('{'); } + "}" { if (balance > 0) { + balance--; + actionText.append('}'); + } + else { + yybegin(REGEXPSTART); + Action a = new Action(actionText.toString(), action_line); + actions.add(a); + return symbol(ACTION, a); + } + } + + {JavaCode} { actionText.append(yytext()); } + + <> { throw new ScannerException(file,ErrorMessages.EOF_IN_ACTION, action_line-1); } +} + + { + + "/"+ "*" { commentbalance++; } + "*"+ "/" { if (commentbalance > 0) + commentbalance--; + else + yybegin(nextState); + } + + {JFlexComment} { /* ignore */ } + + <> { throw new ScannerException(file,ErrorMessages.EOF_IN_COMMENT); } +} + + { + "}" { yybegin(REGEXP); return symbol(STRING, string.toString()); } + {HexDigit}{1,6} { int codePoint = Integer.parseInt(yytext(), 16); + if (codePoint <= unicodeProperties.getMaximumCodePoint()) { + string.append(Character.toChars(codePoint)); + } else { + throw new ScannerException(file,ErrorMessages.CODEPOINT_OUT_OF_RANGE, yyline, yycolumn); + } + } + {WSPNL}+ { } + <> { throw new ScannerException(file,ErrorMessages.EOF_IN_REGEXP); } +} + + { // Specialized form: newlines disallowed, and doesn't return a symbol + "}" { yybegin(STRING_CONTENT); } + {HexDigit}{1,6} { int codePoint = Integer.parseInt(yytext(), 16); + if (codePoint <= unicodeProperties.getMaximumCodePoint()) { + string.append(Character.toChars(codePoint)); + } else { + throw new ScannerException(file, ErrorMessages.CODEPOINT_OUT_OF_RANGE, yyline, yycolumn); + } + } + {NL} { throw new ScannerException(file,ErrorMessages.UNTERMINATED_STR, yyline, yycolumn); } + {WSP}+ { } + <> { throw new ScannerException(file,ErrorMessages.EOF_IN_STRING); } +} + + { // Specialized form: only one codepoint allowed, no whitespace allowed + {HexDigit}{1,6} "}" { int codePoint = Integer.parseInt(yytext().substring(0, yylength() - 1), 16); + if (codePoint <= unicodeProperties.getMaximumCodePoint()) { + yybegin(CHARCLASS); + return symbol(CHAR, codePoint); + } else { + throw new ScannerException(file, ErrorMessages.CODEPOINT_OUT_OF_RANGE, yyline, yycolumn); + } + } + <> { throw new ScannerException(file,ErrorMessages.EOF_IN_REGEXP); } +} + +. { throw new ScannerException(file,ErrorMessages.UNEXPECTED_CHAR, yyline, yycolumn); } +\R { throw new ScannerException(file,ErrorMessages.UNEXPECTED_NL, yyline, yycolumn); } + +<> { if ( yymoreStreams() ) { + file = (File) files.pop(); + yypopStream(); + } + else { + return symbol(EOF); + } + } diff --git a/samples/JFlex/java.jflex b/samples/JFlex/java.jflex new file mode 100644 index 00000000..b12f15d2 --- /dev/null +++ b/samples/JFlex/java.jflex @@ -0,0 +1,305 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright (C) 1998-2015 Gerwin Klein * + * All rights reserved. * + * * + * License: BSD * + * * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* Java 1.2 language lexer specification */ + +/* Use together with unicode.flex for Unicode preprocesssing */ +/* and java12.cup for a Java 1.2 parser */ + +/* Note that this lexer specification is not tuned for speed. + It is in fact quite slow on integer and floating point literals, + because the input is read twice and the methods used to parse + the numbers are not very fast. + For a production quality application (e.g. a Java compiler) + this could be optimized */ + + +import java_cup.runtime.*; + +%% + +%public +%class Scanner +%implements sym + +%unicode + +%line +%column + +%cup +%cupdebug + +%{ + StringBuilder string = new StringBuilder(); + + private Symbol symbol(int type) { + return new JavaSymbol(type, yyline+1, yycolumn+1); + } + + private Symbol symbol(int type, Object value) { + return new JavaSymbol(type, yyline+1, yycolumn+1, value); + } + + /** + * assumes correct representation of a long value for + * specified radix in scanner buffer from start + * to end + */ + private long parseLong(int start, int end, int radix) { + long result = 0; + long digit; + + for (int i = start; i < end; i++) { + digit = Character.digit(yycharat(i),radix); + result*= radix; + result+= digit; + } + + return result; + } +%} + +/* main character classes */ +LineTerminator = \r|\n|\r\n +InputCharacter = [^\r\n] + +WhiteSpace = {LineTerminator} | [ \t\f] + +/* comments */ +Comment = {TraditionalComment} | {EndOfLineComment} | + {DocumentationComment} + +TraditionalComment = "/*" [^*] ~"*/" | "/*" "*"+ "/" +EndOfLineComment = "//" {InputCharacter}* {LineTerminator}? +DocumentationComment = "/*" "*"+ [^/*] ~"*/" + +/* identifiers */ +Identifier = [:jletter:][:jletterdigit:]* + +/* integer literals */ +DecIntegerLiteral = 0 | [1-9][0-9]* +DecLongLiteral = {DecIntegerLiteral} [lL] + +HexIntegerLiteral = 0 [xX] 0* {HexDigit} {1,8} +HexLongLiteral = 0 [xX] 0* {HexDigit} {1,16} [lL] +HexDigit = [0-9a-fA-F] + +OctIntegerLiteral = 0+ [1-3]? {OctDigit} {1,15} +OctLongLiteral = 0+ 1? {OctDigit} {1,21} [lL] +OctDigit = [0-7] + +/* floating point literals */ +FloatLiteral = ({FLit1}|{FLit2}|{FLit3}) {Exponent}? [fF] +DoubleLiteral = ({FLit1}|{FLit2}|{FLit3}) {Exponent}? + +FLit1 = [0-9]+ \. [0-9]* +FLit2 = \. [0-9]+ +FLit3 = [0-9]+ +Exponent = [eE] [+-]? [0-9]+ + +/* string and character literals */ +StringCharacter = [^\r\n\"\\] +SingleCharacter = [^\r\n\'\\] + +%state STRING, CHARLITERAL + +%% + + { + + /* keywords */ + "abstract" { return symbol(ABSTRACT); } + "boolean" { return symbol(BOOLEAN); } + "break" { return symbol(BREAK); } + "byte" { return symbol(BYTE); } + "case" { return symbol(CASE); } + "catch" { return symbol(CATCH); } + "char" { return symbol(CHAR); } + "class" { return symbol(CLASS); } + "const" { return symbol(CONST); } + "continue" { return symbol(CONTINUE); } + "do" { return symbol(DO); } + "double" { return symbol(DOUBLE); } + "else" { return symbol(ELSE); } + "extends" { return symbol(EXTENDS); } + "final" { return symbol(FINAL); } + "finally" { return symbol(FINALLY); } + "float" { return symbol(FLOAT); } + "for" { return symbol(FOR); } + "default" { return symbol(DEFAULT); } + "implements" { return symbol(IMPLEMENTS); } + "import" { return symbol(IMPORT); } + "instanceof" { return symbol(INSTANCEOF); } + "int" { return symbol(INT); } + "interface" { return symbol(INTERFACE); } + "long" { return symbol(LONG); } + "native" { return symbol(NATIVE); } + "new" { return symbol(NEW); } + "goto" { return symbol(GOTO); } + "if" { return symbol(IF); } + "public" { return symbol(PUBLIC); } + "short" { return symbol(SHORT); } + "super" { return symbol(SUPER); } + "switch" { return symbol(SWITCH); } + "synchronized" { return symbol(SYNCHRONIZED); } + "package" { return symbol(PACKAGE); } + "private" { return symbol(PRIVATE); } + "protected" { return symbol(PROTECTED); } + "transient" { return symbol(TRANSIENT); } + "return" { return symbol(RETURN); } + "void" { return symbol(VOID); } + "static" { return symbol(STATIC); } + "while" { return symbol(WHILE); } + "this" { return symbol(THIS); } + "throw" { return symbol(THROW); } + "throws" { return symbol(THROWS); } + "try" { return symbol(TRY); } + "volatile" { return symbol(VOLATILE); } + "strictfp" { return symbol(STRICTFP); } + + /* boolean literals */ + "true" { return symbol(BOOLEAN_LITERAL, true); } + "false" { return symbol(BOOLEAN_LITERAL, false); } + + /* null literal */ + "null" { return symbol(NULL_LITERAL); } + + + /* separators */ + "(" { return symbol(LPAREN); } + ")" { return symbol(RPAREN); } + "{" { return symbol(LBRACE); } + "}" { return symbol(RBRACE); } + "[" { return symbol(LBRACK); } + "]" { return symbol(RBRACK); } + ";" { return symbol(SEMICOLON); } + "," { return symbol(COMMA); } + "." { return symbol(DOT); } + + /* operators */ + "=" { return symbol(EQ); } + ">" { return symbol(GT); } + "<" { return symbol(LT); } + "!" { return symbol(NOT); } + "~" { return symbol(COMP); } + "?" { return symbol(QUESTION); } + ":" { return symbol(COLON); } + "==" { return symbol(EQEQ); } + "<=" { return symbol(LTEQ); } + ">=" { return symbol(GTEQ); } + "!=" { return symbol(NOTEQ); } + "&&" { return symbol(ANDAND); } + "||" { return symbol(OROR); } + "++" { return symbol(PLUSPLUS); } + "--" { return symbol(MINUSMINUS); } + "+" { return symbol(PLUS); } + "-" { return symbol(MINUS); } + "*" { return symbol(MULT); } + "/" { return symbol(DIV); } + "&" { return symbol(AND); } + "|" { return symbol(OR); } + "^" { return symbol(XOR); } + "%" { return symbol(MOD); } + "<<" { return symbol(LSHIFT); } + ">>" { return symbol(RSHIFT); } + ">>>" { return symbol(URSHIFT); } + "+=" { return symbol(PLUSEQ); } + "-=" { return symbol(MINUSEQ); } + "*=" { return symbol(MULTEQ); } + "/=" { return symbol(DIVEQ); } + "&=" { return symbol(ANDEQ); } + "|=" { return symbol(OREQ); } + "^=" { return symbol(XOREQ); } + "%=" { return symbol(MODEQ); } + "<<=" { return symbol(LSHIFTEQ); } + ">>=" { return symbol(RSHIFTEQ); } + ">>>=" { return symbol(URSHIFTEQ); } + + /* string literal */ + \" { yybegin(STRING); string.setLength(0); } + + /* character literal */ + \' { yybegin(CHARLITERAL); } + + /* numeric literals */ + + /* This is matched together with the minus, because the number is too big to + be represented by a positive integer. */ + "-2147483648" { return symbol(INTEGER_LITERAL, new Integer(Integer.MIN_VALUE)); } + + {DecIntegerLiteral} { return symbol(INTEGER_LITERAL, new Integer(yytext())); } + {DecLongLiteral} { return symbol(INTEGER_LITERAL, new Long(yytext().substring(0,yylength()-1))); } + + {HexIntegerLiteral} { return symbol(INTEGER_LITERAL, new Integer((int) parseLong(2, yylength(), 16))); } + {HexLongLiteral} { return symbol(INTEGER_LITERAL, new Long(parseLong(2, yylength()-1, 16))); } + + {OctIntegerLiteral} { return symbol(INTEGER_LITERAL, new Integer((int) parseLong(0, yylength(), 8))); } + {OctLongLiteral} { return symbol(INTEGER_LITERAL, new Long(parseLong(0, yylength()-1, 8))); } + + {FloatLiteral} { return symbol(FLOATING_POINT_LITERAL, new Float(yytext().substring(0,yylength()-1))); } + {DoubleLiteral} { return symbol(FLOATING_POINT_LITERAL, new Double(yytext())); } + {DoubleLiteral}[dD] { return symbol(FLOATING_POINT_LITERAL, new Double(yytext().substring(0,yylength()-1))); } + + /* comments */ + {Comment} { /* ignore */ } + + /* whitespace */ + {WhiteSpace} { /* ignore */ } + + /* identifiers */ + {Identifier} { return symbol(IDENTIFIER, yytext()); } +} + + { + \" { yybegin(YYINITIAL); return symbol(STRING_LITERAL, string.toString()); } + + {StringCharacter}+ { string.append( yytext() ); } + + /* escape sequences */ + "\\b" { string.append( '\b' ); } + "\\t" { string.append( '\t' ); } + "\\n" { string.append( '\n' ); } + "\\f" { string.append( '\f' ); } + "\\r" { string.append( '\r' ); } + "\\\"" { string.append( '\"' ); } + "\\'" { string.append( '\'' ); } + "\\\\" { string.append( '\\' ); } + \\[0-3]?{OctDigit}?{OctDigit} { char val = (char) Integer.parseInt(yytext().substring(1),8); + string.append( val ); } + + /* error cases */ + \\. { throw new RuntimeException("Illegal escape sequence \""+yytext()+"\""); } + {LineTerminator} { throw new RuntimeException("Unterminated string at end of line"); } +} + + { + {SingleCharacter}\' { yybegin(YYINITIAL); return symbol(CHARACTER_LITERAL, yytext().charAt(0)); } + + /* escape sequences */ + "\\b"\' { yybegin(YYINITIAL); return symbol(CHARACTER_LITERAL, '\b');} + "\\t"\' { yybegin(YYINITIAL); return symbol(CHARACTER_LITERAL, '\t');} + "\\n"\' { yybegin(YYINITIAL); return symbol(CHARACTER_LITERAL, '\n');} + "\\f"\' { yybegin(YYINITIAL); return symbol(CHARACTER_LITERAL, '\f');} + "\\r"\' { yybegin(YYINITIAL); return symbol(CHARACTER_LITERAL, '\r');} + "\\\""\' { yybegin(YYINITIAL); return symbol(CHARACTER_LITERAL, '\"');} + "\\'"\' { yybegin(YYINITIAL); return symbol(CHARACTER_LITERAL, '\'');} + "\\\\"\' { yybegin(YYINITIAL); return symbol(CHARACTER_LITERAL, '\\'); } + \\[0-3]?{OctDigit}?{OctDigit}\' { yybegin(YYINITIAL); + int val = Integer.parseInt(yytext().substring(1,yylength()-1),8); + return symbol(CHARACTER_LITERAL, (char)val); } + + /* error cases */ + \\. { throw new RuntimeException("Illegal escape sequence \""+yytext()+"\""); } + {LineTerminator} { throw new RuntimeException("Unterminated character literal at end of line"); } +} + +/* error fallback */ +[^] { throw new RuntimeException("Illegal character \""+yytext()+ + "\" at line "+yyline+", column "+yycolumn); } +<> { return symbol(EOF); } \ No newline at end of file