mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 09:40:21 +00:00
Replace the tokenizer with a flex-based scanner (#3846)
* Lex everything except SGML, multiline, SHEBANG * Prepend SHEBANG#! to tokens * Support SGML tag/attribute extraction * Multiline comments * WIP cont'd; productionifying * Compile before test * Add extension to gemspec * Add flex task to build lexer * Reentrant extra data storage * regenerate lexer * use prefix * rebuild lexer on linux * Optimise a number of operations: * Don't read and split the entire file if we only ever use the first/last n lines * Only consider the first 50KiB when using heuristics/classifying. This can save a *lot* of time; running a large number of regexes over 1MiB of text takes a while. * Memoize File.size/read/stat; re-reading in a 500KiB file every time `data` is called adds up a lot. * Use single regex for C++ * act like #lines * [1][-2..-1] => nil, ffs * k may not be set
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -8,3 +8,6 @@ lib/linguist/samples.json
|
|||||||
/node_modules
|
/node_modules
|
||||||
test/fixtures/ace_modes.json
|
test/fixtures/ace_modes.json
|
||||||
/vendor/gems/
|
/vendor/gems/
|
||||||
|
/tmp
|
||||||
|
*.bundle
|
||||||
|
*.so
|
||||||
|
|||||||
23
Rakefile
23
Rakefile
@@ -1,6 +1,7 @@
|
|||||||
require 'bundler/setup'
|
require 'bundler/setup'
|
||||||
require 'rake/clean'
|
require 'rake/clean'
|
||||||
require 'rake/testtask'
|
require 'rake/testtask'
|
||||||
|
require 'rake/extensiontask'
|
||||||
require 'yaml'
|
require 'yaml'
|
||||||
require 'yajl'
|
require 'yajl'
|
||||||
require 'open-uri'
|
require 'open-uri'
|
||||||
@@ -10,8 +11,14 @@ task :default => :test
|
|||||||
|
|
||||||
Rake::TestTask.new
|
Rake::TestTask.new
|
||||||
|
|
||||||
|
gem_spec = Gem::Specification.load('github-linguist.gemspec')
|
||||||
|
|
||||||
|
Rake::ExtensionTask.new('linguist', gem_spec) do |ext|
|
||||||
|
ext.lib_dir = File.join('lib', 'linguist')
|
||||||
|
end
|
||||||
|
|
||||||
# Extend test task to check for samples and fetch latest Ace modes
|
# Extend test task to check for samples and fetch latest Ace modes
|
||||||
task :test => [:check_samples, :fetch_ace_modes]
|
task :test => [:compile, :check_samples, :fetch_ace_modes]
|
||||||
|
|
||||||
desc "Check that we have samples.json generated"
|
desc "Check that we have samples.json generated"
|
||||||
task :check_samples do
|
task :check_samples do
|
||||||
@@ -34,12 +41,24 @@ task :fetch_ace_modes do
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
task :samples do
|
task :samples => :compile do
|
||||||
require 'linguist/samples'
|
require 'linguist/samples'
|
||||||
json = Yajl.dump(Linguist::Samples.data, :pretty => true)
|
json = Yajl.dump(Linguist::Samples.data, :pretty => true)
|
||||||
File.write 'lib/linguist/samples.json', json
|
File.write 'lib/linguist/samples.json', json
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Oldest flex release (major, minor, revision) accepted for regenerating the lexer.
FLEX_MIN_VER = [2, 5, 39].freeze

# Regenerate the lexer sources by running flex on ext/linguist/tokenizer.l.
#
# Fails when flex is missing, when it is older than FLEX_MIN_VER, or when the
# flex run itself exits with a non-zero status.
task :flex do
  version_output =
    begin
      `flex -V`
    rescue Errno::ENOENT
      # Backticks raise when the executable does not exist; treat that the
      # same as unrecognisable output so the friendly message below is used.
      ""
    end

  version = version_output.match(/^flex (\d+)\.(\d+)\.(\d+)/)
  fail "flex not detected" unless version

  # Arrays compare element by element, which is exactly a version comparison.
  detected = version.captures.map(&:to_i)
  if (detected <=> FLEX_MIN_VER) < 0
    fail "building linguist's lexer requires at least flex #{FLEX_MIN_VER.join(".")}"
  end

  # `sh` aborts the task on a non-zero exit status instead of silently
  # leaving a stale or missing lexer behind.
  sh "cd ext/linguist && flex tokenizer.l"
end
|
||||||
|
|
||||||
task :build_gem => :samples do
|
task :build_gem => :samples do
|
||||||
rm_rf "grammars"
|
rm_rf "grammars"
|
||||||
sh "script/convert-grammars"
|
sh "script/convert-grammars"
|
||||||
|
|||||||
3
ext/linguist/extconf.rb
Normal file
3
ext/linguist/extconf.rb
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
require 'mkmf'
|
||||||
|
dir_config('linguist')
|
||||||
|
create_makefile('linguist/linguist')
|
||||||
8269
ext/linguist/lex.linguist_yy.c
Normal file
8269
ext/linguist/lex.linguist_yy.c
Normal file
File diff suppressed because it is too large
Load Diff
353
ext/linguist/lex.linguist_yy.h
Normal file
353
ext/linguist/lex.linguist_yy.h
Normal file
@@ -0,0 +1,353 @@
|
|||||||
|
#ifndef linguist_yyHEADER_H
|
||||||
|
#define linguist_yyHEADER_H 1
|
||||||
|
#define linguist_yyIN_HEADER 1
|
||||||
|
|
||||||
|
#line 6 "lex.linguist_yy.h"
|
||||||
|
|
||||||
|
#define YY_INT_ALIGNED short int
|
||||||
|
|
||||||
|
/* A lexical scanner generated by flex */
|
||||||
|
|
||||||
|
#define FLEX_SCANNER
|
||||||
|
#define YY_FLEX_MAJOR_VERSION 2
|
||||||
|
#define YY_FLEX_MINOR_VERSION 5
|
||||||
|
#define YY_FLEX_SUBMINOR_VERSION 39
|
||||||
|
#if YY_FLEX_SUBMINOR_VERSION > 0
|
||||||
|
#define FLEX_BETA
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* First, we deal with platform-specific or compiler-specific issues. */
|
||||||
|
|
||||||
|
/* begin standard C headers. */
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
/* end standard C headers. */
|
||||||
|
|
||||||
|
/* flex integer type definitions */
|
||||||
|
|
||||||
|
#ifndef FLEXINT_H
|
||||||
|
#define FLEXINT_H
|
||||||
|
|
||||||
|
/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
|
||||||
|
|
||||||
|
#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
|
||||||
|
|
||||||
|
/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
|
||||||
|
* if you want the limit (max/min) macros for int types.
|
||||||
|
*/
|
||||||
|
#ifndef __STDC_LIMIT_MACROS
|
||||||
|
#define __STDC_LIMIT_MACROS 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <inttypes.h>
|
||||||
|
typedef int8_t flex_int8_t;
|
||||||
|
typedef uint8_t flex_uint8_t;
|
||||||
|
typedef int16_t flex_int16_t;
|
||||||
|
typedef uint16_t flex_uint16_t;
|
||||||
|
typedef int32_t flex_int32_t;
|
||||||
|
typedef uint32_t flex_uint32_t;
|
||||||
|
#else
|
||||||
|
typedef signed char flex_int8_t;
|
||||||
|
typedef short int flex_int16_t;
|
||||||
|
typedef int flex_int32_t;
|
||||||
|
typedef unsigned char flex_uint8_t;
|
||||||
|
typedef unsigned short int flex_uint16_t;
|
||||||
|
typedef unsigned int flex_uint32_t;
|
||||||
|
|
||||||
|
/* Limits of integral types. */
|
||||||
|
#ifndef INT8_MIN
|
||||||
|
#define INT8_MIN (-128)
|
||||||
|
#endif
|
||||||
|
#ifndef INT16_MIN
|
||||||
|
#define INT16_MIN (-32767-1)
|
||||||
|
#endif
|
||||||
|
#ifndef INT32_MIN
|
||||||
|
#define INT32_MIN (-2147483647-1)
|
||||||
|
#endif
|
||||||
|
#ifndef INT8_MAX
|
||||||
|
#define INT8_MAX (127)
|
||||||
|
#endif
|
||||||
|
#ifndef INT16_MAX
|
||||||
|
#define INT16_MAX (32767)
|
||||||
|
#endif
|
||||||
|
#ifndef INT32_MAX
|
||||||
|
#define INT32_MAX (2147483647)
|
||||||
|
#endif
|
||||||
|
#ifndef UINT8_MAX
|
||||||
|
#define UINT8_MAX (255U)
|
||||||
|
#endif
|
||||||
|
#ifndef UINT16_MAX
|
||||||
|
#define UINT16_MAX (65535U)
|
||||||
|
#endif
|
||||||
|
#ifndef UINT32_MAX
|
||||||
|
#define UINT32_MAX (4294967295U)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* ! C99 */
|
||||||
|
|
||||||
|
#endif /* ! FLEXINT_H */
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
|
||||||
|
/* The "const" storage-class-modifier is valid. */
|
||||||
|
#define YY_USE_CONST
|
||||||
|
|
||||||
|
#else /* ! __cplusplus */
|
||||||
|
|
||||||
|
/* C99 requires __STDC__ to be defined as 1. */
|
||||||
|
#if defined (__STDC__)
|
||||||
|
|
||||||
|
#define YY_USE_CONST
|
||||||
|
|
||||||
|
#endif /* defined (__STDC__) */
|
||||||
|
#endif /* ! __cplusplus */
|
||||||
|
|
||||||
|
#ifdef YY_USE_CONST
|
||||||
|
#define yyconst const
|
||||||
|
#else
|
||||||
|
#define yyconst
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* An opaque pointer. */
|
||||||
|
#ifndef YY_TYPEDEF_YY_SCANNER_T
|
||||||
|
#define YY_TYPEDEF_YY_SCANNER_T
|
||||||
|
typedef void* yyscan_t;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* For convenience, these vars (plus the bison vars far below)
|
||||||
|
are macros in the reentrant scanner. */
|
||||||
|
#define yyin yyg->yyin_r
|
||||||
|
#define yyout yyg->yyout_r
|
||||||
|
#define yyextra yyg->yyextra_r
|
||||||
|
#define yyleng yyg->yyleng_r
|
||||||
|
#define yytext yyg->yytext_r
|
||||||
|
#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
|
||||||
|
#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
|
||||||
|
#define yy_flex_debug yyg->yy_flex_debug_r
|
||||||
|
|
||||||
|
/* Size of default input buffer. */
|
||||||
|
#ifndef YY_BUF_SIZE
|
||||||
|
#ifdef __ia64__
|
||||||
|
/* On IA-64, the buffer size is 16k, not 8k.
|
||||||
|
* Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
|
||||||
|
* Ditto for the __ia64__ case accordingly.
|
||||||
|
*/
|
||||||
|
#define YY_BUF_SIZE 32768
|
||||||
|
#else
|
||||||
|
#define YY_BUF_SIZE 16384
|
||||||
|
#endif /* __ia64__ */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef YY_TYPEDEF_YY_BUFFER_STATE
|
||||||
|
#define YY_TYPEDEF_YY_BUFFER_STATE
|
||||||
|
typedef struct yy_buffer_state *YY_BUFFER_STATE;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef YY_TYPEDEF_YY_SIZE_T
|
||||||
|
#define YY_TYPEDEF_YY_SIZE_T
|
||||||
|
typedef size_t yy_size_t;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef YY_STRUCT_YY_BUFFER_STATE
|
||||||
|
#define YY_STRUCT_YY_BUFFER_STATE
|
||||||
|
struct yy_buffer_state
|
||||||
|
{
|
||||||
|
FILE *yy_input_file;
|
||||||
|
|
||||||
|
char *yy_ch_buf; /* input buffer */
|
||||||
|
char *yy_buf_pos; /* current position in input buffer */
|
||||||
|
|
||||||
|
/* Size of input buffer in bytes, not including room for EOB
|
||||||
|
* characters.
|
||||||
|
*/
|
||||||
|
yy_size_t yy_buf_size;
|
||||||
|
|
||||||
|
/* Number of characters read into yy_ch_buf, not including EOB
|
||||||
|
* characters.
|
||||||
|
*/
|
||||||
|
yy_size_t yy_n_chars;
|
||||||
|
|
||||||
|
/* Whether we "own" the buffer - i.e., we know we created it,
|
||||||
|
* and can realloc() it to grow it, and should free() it to
|
||||||
|
* delete it.
|
||||||
|
*/
|
||||||
|
int yy_is_our_buffer;
|
||||||
|
|
||||||
|
/* Whether this is an "interactive" input source; if so, and
|
||||||
|
* if we're using stdio for input, then we want to use getc()
|
||||||
|
* instead of fread(), to make sure we stop fetching input after
|
||||||
|
* each newline.
|
||||||
|
*/
|
||||||
|
int yy_is_interactive;
|
||||||
|
|
||||||
|
/* Whether we're considered to be at the beginning of a line.
|
||||||
|
* If so, '^' rules will be active on the next match, otherwise
|
||||||
|
* not.
|
||||||
|
*/
|
||||||
|
int yy_at_bol;
|
||||||
|
|
||||||
|
int yy_bs_lineno; /**< The line count. */
|
||||||
|
int yy_bs_column; /**< The column count. */
|
||||||
|
|
||||||
|
/* Whether to try to fill the input buffer when we reach the
|
||||||
|
* end of it.
|
||||||
|
*/
|
||||||
|
int yy_fill_buffer;
|
||||||
|
|
||||||
|
int yy_buffer_status;
|
||||||
|
|
||||||
|
};
|
||||||
|
#endif /* !YY_STRUCT_YY_BUFFER_STATE */
|
||||||
|
|
||||||
|
void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
|
||||||
|
void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
|
||||||
|
YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
|
||||||
|
void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
|
||||||
|
void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
|
||||||
|
void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
|
||||||
|
void linguist_yypop_buffer_state (yyscan_t yyscanner );
|
||||||
|
|
||||||
|
YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
|
||||||
|
YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
|
||||||
|
YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );
|
||||||
|
|
||||||
|
void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
|
||||||
|
void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
|
||||||
|
void linguist_yyfree (void * ,yyscan_t yyscanner );
|
||||||
|
|
||||||
|
/* Begin user sect3 */
|
||||||
|
|
||||||
|
#define yytext_ptr yytext_r
|
||||||
|
|
||||||
|
#ifdef YY_HEADER_EXPORT_START_CONDITIONS
|
||||||
|
#define INITIAL 0
|
||||||
|
#define sgml 1
|
||||||
|
#define c_comment 2
|
||||||
|
#define xml_comment 3
|
||||||
|
#define haskell_comment 4
|
||||||
|
#define ocaml_comment 5
|
||||||
|
#define python_dcomment 6
|
||||||
|
#define python_scomment 7
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef YY_NO_UNISTD_H
|
||||||
|
/* Special case for "unistd.h", since it is non-ANSI. We include it way
|
||||||
|
* down here because we want the user's section 1 to have been scanned first.
|
||||||
|
* The user has a chance to override it with an option.
|
||||||
|
*/
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define YY_EXTRA_TYPE struct tokenizer_extra *
|
||||||
|
|
||||||
|
int linguist_yylex_init (yyscan_t* scanner);
|
||||||
|
|
||||||
|
int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
|
||||||
|
|
||||||
|
/* Accessor methods to globals.
|
||||||
|
These are made visible to non-reentrant scanners for convenience. */
|
||||||
|
|
||||||
|
int linguist_yylex_destroy (yyscan_t yyscanner );
|
||||||
|
|
||||||
|
int linguist_yyget_debug (yyscan_t yyscanner );
|
||||||
|
|
||||||
|
void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );
|
||||||
|
|
||||||
|
YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );
|
||||||
|
|
||||||
|
void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
|
||||||
|
|
||||||
|
FILE *linguist_yyget_in (yyscan_t yyscanner );
|
||||||
|
|
||||||
|
void linguist_yyset_in (FILE * in_str ,yyscan_t yyscanner );
|
||||||
|
|
||||||
|
FILE *linguist_yyget_out (yyscan_t yyscanner );
|
||||||
|
|
||||||
|
void linguist_yyset_out (FILE * out_str ,yyscan_t yyscanner );
|
||||||
|
|
||||||
|
yy_size_t linguist_yyget_leng (yyscan_t yyscanner );
|
||||||
|
|
||||||
|
char *linguist_yyget_text (yyscan_t yyscanner );
|
||||||
|
|
||||||
|
int linguist_yyget_lineno (yyscan_t yyscanner );
|
||||||
|
|
||||||
|
void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
|
||||||
|
|
||||||
|
int linguist_yyget_column (yyscan_t yyscanner );
|
||||||
|
|
||||||
|
void linguist_yyset_column (int column_no ,yyscan_t yyscanner );
|
||||||
|
|
||||||
|
/* Macros after this point can all be overridden by user definitions in
|
||||||
|
* section 1.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef YY_SKIP_YYWRAP
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" int linguist_yywrap (yyscan_t yyscanner );
|
||||||
|
#else
|
||||||
|
extern int linguist_yywrap (yyscan_t yyscanner );
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef yytext_ptr
|
||||||
|
static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef YY_NEED_STRLEN
|
||||||
|
static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef YY_NO_INPUT
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Amount of stuff to slurp up with each read. */
|
||||||
|
#ifndef YY_READ_BUF_SIZE
|
||||||
|
#ifdef __ia64__
|
||||||
|
/* On IA-64, the buffer size is 16k, not 8k */
|
||||||
|
#define YY_READ_BUF_SIZE 16384
|
||||||
|
#else
|
||||||
|
#define YY_READ_BUF_SIZE 8192
|
||||||
|
#endif /* __ia64__ */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Number of entries by which start-condition stack grows. */
|
||||||
|
#ifndef YY_START_STACK_INCR
|
||||||
|
#define YY_START_STACK_INCR 25
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Default declaration of generated scanner - a define so the user can
|
||||||
|
* easily add parameters.
|
||||||
|
*/
|
||||||
|
#ifndef YY_DECL
|
||||||
|
#define YY_DECL_IS_OURS 1
|
||||||
|
|
||||||
|
extern int linguist_yylex (yyscan_t yyscanner);
|
||||||
|
|
||||||
|
#define YY_DECL int linguist_yylex (yyscan_t yyscanner)
|
||||||
|
#endif /* !YY_DECL */
|
||||||
|
|
||||||
|
/* yy_get_previous_state - get the state just before the EOB char was reached */
|
||||||
|
|
||||||
|
#undef YY_NEW_FILE
|
||||||
|
#undef YY_FLUSH_BUFFER
|
||||||
|
#undef yy_set_bol
|
||||||
|
#undef yy_new_buffer
|
||||||
|
#undef yy_set_interactive
|
||||||
|
#undef YY_DO_BEFORE_ACTION
|
||||||
|
|
||||||
|
#ifdef YY_DECL_IS_OURS
|
||||||
|
#undef YY_DECL_IS_OURS
|
||||||
|
#undef YY_DECL
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#line 117 "tokenizer.l"
|
||||||
|
|
||||||
|
|
||||||
|
#line 352 "lex.linguist_yy.h"
|
||||||
|
#undef linguist_yyIN_HEADER
|
||||||
|
#endif /* linguist_yyHEADER_H */
|
||||||
64
ext/linguist/linguist.c
Normal file
64
ext/linguist/linguist.c
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
#include "ruby.h"
|
||||||
|
#include "linguist.h"
|
||||||
|
#include "lex.linguist_yy.h"
|
||||||
|
|
||||||
|
int linguist_yywrap(yyscan_t yyscanner) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Linguist::Tokenizer#extract_tokens(data)
 *
 * Runs the flex scanner over a Ruby String and collects the tokens it
 * reports into a new Ruby Array of Strings.
 *
 * rb_data - the String to tokenize; a TypeError is raised for anything else.
 *           Only the first 100000 bytes are scanned.
 *
 * Shebang tokens are stored with a "SHEBANG#!" prefix and SGML tokens with
 * a trailing ">". Raises a SystemCallError if the scanner cannot be set up.
 */
static VALUE rb_tokenizer_extract_tokens(VALUE self, VALUE rb_data) {
	YY_BUFFER_STATE buf;
	yyscan_t scanner;
	struct tokenizer_extra extra;
	VALUE ary, s;
	long len;
	int r;

	Check_Type(rb_data, T_STRING);

	/* Cap the amount of input handed to the scanner. */
	len = RSTRING_LEN(rb_data);
	if (len > 100000)
		len = 100000;

	/* A non-zero result means the scanner was not created (errno is set),
	 * so it must not be used or destroyed. */
	if (linguist_yylex_init_extra(&extra, &scanner) != 0)
		rb_sys_fail("linguist_yylex_init_extra");

	/* The prototype takes a yy_size_t length; len is within [0, 100000]. */
	buf = linguist_yy_scan_bytes(RSTRING_PTR(rb_data), (yy_size_t) len, scanner);

	ary = rb_ary_new();
	do {
		/* Reset the slot the scanner rules fill in through yyextra. */
		extra.type = NO_ACTION;
		extra.token = NULL;
		r = linguist_yylex(scanner);
		/* The token is handled before r is tested: a rule may set a token
		 * and then terminate the scan in the same call. */
		switch (extra.type) {
		case NO_ACTION:
			break;
		case REGULAR_TOKEN:
			rb_ary_push(ary, rb_str_new2(extra.token));
			free(extra.token); /* the rules hand over strdup()ed strings */
			break;
		case SHEBANG_TOKEN:
			s = rb_str_new2("SHEBANG#!");
			rb_str_cat2(s, extra.token);
			rb_ary_push(ary, s);
			free(extra.token);
			break;
		case SGML_TOKEN:
			s = rb_str_new2(extra.token);
			rb_str_cat2(s, ">");
			rb_ary_push(ary, s);
			free(extra.token);
			break;
		}
	} while (r);

	linguist_yy_delete_buffer(buf, scanner);
	linguist_yylex_destroy(scanner);

	return ary;
}
|
||||||
|
|
||||||
|
__attribute__((visibility("default"))) void Init_linguist() {
|
||||||
|
VALUE rb_mLinguist = rb_define_module("Linguist");
|
||||||
|
VALUE rb_cTokenizer = rb_define_class_under(rb_mLinguist, "Tokenizer", rb_cObject);
|
||||||
|
|
||||||
|
rb_define_method(rb_cTokenizer, "extract_tokens", rb_tokenizer_extract_tokens, 1);
|
||||||
|
}
|
||||||
11
ext/linguist/linguist.h
Normal file
11
ext/linguist/linguist.h
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
enum tokenizer_type {
|
||||||
|
NO_ACTION,
|
||||||
|
REGULAR_TOKEN,
|
||||||
|
SHEBANG_TOKEN,
|
||||||
|
SGML_TOKEN,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct tokenizer_extra {
|
||||||
|
char *token;
|
||||||
|
enum tokenizer_type type;
|
||||||
|
};
|
||||||
119
ext/linguist/tokenizer.l
Normal file
119
ext/linguist/tokenizer.l
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
%{
|
||||||
|
|
||||||
|
#include "linguist.h"
|
||||||
|
|
||||||
|
#define feed_token(tok, typ) do { \
|
||||||
|
yyextra->token = (tok); \
|
||||||
|
yyextra->type = (typ); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define eat_until_eol() do { \
|
||||||
|
int c; \
|
||||||
|
while ((c = input(yyscanner)) != '\n' && c != EOF); \
|
||||||
|
if (c == EOF) \
|
||||||
|
yyterminate(); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define eat_until_unescaped(q) do { \
|
||||||
|
int c; \
|
||||||
|
while ((c = input(yyscanner)) != EOF) { \
|
||||||
|
if (c == '\n') \
|
||||||
|
break; \
|
||||||
|
if (c == '\\') { \
|
||||||
|
c = input(yyscanner); \
|
||||||
|
if (c == EOF) \
|
||||||
|
yyterminate(); \
|
||||||
|
} else if (c == q) \
|
||||||
|
break; \
|
||||||
|
} \
|
||||||
|
if (c == EOF) \
|
||||||
|
yyterminate(); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
%}
|
||||||
|
|
||||||
|
%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"
|
||||||
|
%x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment
|
||||||
|
|
||||||
|
%%
|
||||||
|
|
||||||
|
^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
|
||||||
|
const char *off = strrchr(yytext, ' ');
|
||||||
|
if (!off)
|
||||||
|
off = yytext;
|
||||||
|
else
|
||||||
|
++off;
|
||||||
|
feed_token(strdup(off), SHEBANG_TOKEN);
|
||||||
|
eat_until_eol();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
^#![ \t]*[[:alpha:]_\/]+ {
|
||||||
|
const char *off = strrchr(yytext, '/');
|
||||||
|
if (!off)
|
||||||
|
off = yytext;
|
||||||
|
else
|
||||||
|
++off;
|
||||||
|
if (strcmp(off, "env") == 0) {
|
||||||
|
eat_until_eol();
|
||||||
|
} else {
|
||||||
|
feed_token(strdup(off), SHEBANG_TOKEN);
|
||||||
|
eat_until_eol();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
^[ \t]*(\/\/|--|\#|%|\")" ".* { /* nothing */ }
|
||||||
|
|
||||||
|
"/*" { BEGIN(c_comment); }
|
||||||
|
/* See below for xml_comment start. */
|
||||||
|
"{-" { BEGIN(haskell_comment); }
|
||||||
|
"(*" { BEGIN(ocaml_comment); }
|
||||||
|
"\"\"\"" { BEGIN(python_dcomment); }
|
||||||
|
"'''" { BEGIN(python_scomment); }
|
||||||
|
|
||||||
|
<c_comment,xml_comment,haskell_comment,ocaml_comment,python_dcomment,python_scomment>.|\n { /* nothing */ }
|
||||||
|
<c_comment>"*/" { BEGIN(INITIAL); }
|
||||||
|
<xml_comment>"-->" { BEGIN(INITIAL); }
|
||||||
|
<haskell_comment>"-}" { BEGIN(INITIAL); }
|
||||||
|
<ocaml_comment>"*)" { BEGIN(INITIAL); }
|
||||||
|
<python_dcomment>"\"\"\"" { BEGIN(INITIAL); }
|
||||||
|
<python_scomment>"'''" { BEGIN(INITIAL); }
|
||||||
|
|
||||||
|
\"\"|'' { /* nothing */ }
|
||||||
|
\" { eat_until_unescaped('"'); }
|
||||||
|
' { eat_until_unescaped('\''); }
|
||||||
|
(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }
|
||||||
|
\<[^ \t\n\r<>]+/>|" "[^<>\n]{0,2048}> {
|
||||||
|
if (strcmp(yytext, "<!--") == 0) {
|
||||||
|
BEGIN(xml_comment);
|
||||||
|
} else {
|
||||||
|
feed_token(strdup(yytext), SGML_TOKEN);
|
||||||
|
BEGIN(sgml);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
<sgml>[[:alnum:]_]+=/\" { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('"'); return 1; }
|
||||||
|
<sgml>[[:alnum:]_]+=/' { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('\''); return 1; }
|
||||||
|
<sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
|
||||||
|
<sgml>[[:alnum:]_]+ { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
|
||||||
|
<sgml>\> { BEGIN(INITIAL); }
|
||||||
|
<sgml>.|\n { /* nothing */ }
|
||||||
|
;|\{|\}|\(|\)|\[|\] { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
|
||||||
|
[[:alnum:]_.@#/*]+ {
|
||||||
|
if (strncmp(yytext, "/*", 2) == 0) {
|
||||||
|
if (strlen(yytext) >= 4 && strcmp(yytext + strlen(yytext) - 2, "*/") == 0) {
|
||||||
|
/* nothing */
|
||||||
|
} else {
|
||||||
|
BEGIN(c_comment);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
feed_token(strdup(yytext), REGULAR_TOKEN);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
\<\<?|\+|\-|\*|\/|%|&&?|\|\|? { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
|
||||||
|
.|\n { /* nothing */ }
|
||||||
|
|
||||||
|
%%
|
||||||
|
|
||||||
@@ -10,8 +10,9 @@ Gem::Specification.new do |s|
|
|||||||
s.homepage = "https://github.com/github/linguist"
|
s.homepage = "https://github.com/github/linguist"
|
||||||
s.license = "MIT"
|
s.license = "MIT"
|
||||||
|
|
||||||
s.files = Dir['lib/**/*'] + Dir['grammars/*'] + ['LICENSE']
|
s.files = Dir['lib/**/*'] + Dir['ext/**/*'] + Dir['grammars/*'] + ['LICENSE']
|
||||||
s.executables = ['linguist', 'git-linguist']
|
s.executables = ['linguist', 'git-linguist']
|
||||||
|
s.extensions = ['ext/linguist/extconf.rb']
|
||||||
|
|
||||||
s.add_dependency 'charlock_holmes', '~> 0.7.5'
|
s.add_dependency 'charlock_holmes', '~> 0.7.5'
|
||||||
s.add_dependency 'escape_utils', '~> 1.1.0'
|
s.add_dependency 'escape_utils', '~> 1.1.0'
|
||||||
@@ -19,6 +20,7 @@ Gem::Specification.new do |s|
|
|||||||
s.add_dependency 'rugged', '>= 0.25.1'
|
s.add_dependency 'rugged', '>= 0.25.1'
|
||||||
|
|
||||||
s.add_development_dependency 'minitest', '>= 5.0'
|
s.add_development_dependency 'minitest', '>= 5.0'
|
||||||
|
s.add_development_dependency 'rake-compiler', '~> 0.9'
|
||||||
s.add_development_dependency 'mocha'
|
s.add_development_dependency 'mocha'
|
||||||
s.add_development_dependency 'plist', '~>3.1'
|
s.add_development_dependency 'plist', '~>3.1'
|
||||||
s.add_development_dependency 'pry'
|
s.add_development_dependency 'pry'
|
||||||
|
|||||||
@@ -275,10 +275,8 @@ module Linguist
|
|||||||
# also--importantly--without having to duplicate many (potentially
|
# also--importantly--without having to duplicate many (potentially
|
||||||
# large) strings.
|
# large) strings.
|
||||||
begin
|
begin
|
||||||
encoded_newlines = ["\r\n", "\r", "\n"].
|
|
||||||
map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) }
|
|
||||||
|
|
||||||
data.split(Regexp.union(encoded_newlines), -1)
|
data.split(encoded_newlines_re, -1)
|
||||||
rescue Encoding::ConverterNotFoundError
|
rescue Encoding::ConverterNotFoundError
|
||||||
# The data is not splittable in the detected encoding. Assume it's
|
# The data is not splittable in the detected encoding. Assume it's
|
||||||
# one big line.
|
# one big line.
|
||||||
@@ -289,6 +287,51 @@ module Linguist
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def encoded_newlines_re
|
||||||
|
@encoded_newlines_re ||= Regexp.union(["\r\n", "\r", "\n"].
|
||||||
|
map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) })
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
# Public: Get the first n lines without splitting the whole blob.
#
# Behaves like `lines[0...n]`: when the data holds fewer than n line
# terminators every line is returned, including a final line that has no
# terminator.
#
# n - The Integer number of lines wanted.
#
# Returns an Array of Strings; empty when the blob is not viewable or has
# no data.
def first_lines(n)
  return lines[0...n] if defined? @lines
  return [] unless viewable? && data

  # Walk forward over at most n terminators; offset ends up just past the
  # last one consumed.
  offset = 0
  found = 0
  while found < n && (hit = data.index(encoded_newlines_re, offset))
    offset = hit + Regexp.last_match(0).length
    found += 1
  end

  # Running out of terminators means the whole data is within the first n
  # lines; cutting at offset would lose the unterminated last line.
  head = found < n ? data : data[0...offset]

  # Splitting a prefix that ends in a terminator yields a trailing "" that
  # belongs to line n + 1, so trim to n entries.
  head.split(encoded_newlines_re, -1)[0...n]
end
|
||||||
|
|
||||||
|
# Public: Get the last n lines without splitting the whole blob.
#
# A terminator at the very end of the data does not start an extra empty
# line, so "a\nb\n" and "a\nb" both end with the line "b". When the data
# holds fewer than n lines, all of them are returned.
#
# n - The Integer number of lines wanted.
#
# Returns an Array of Strings; empty when the blob is not viewable or has
# no data.
def last_lines(n)
  if defined? @lines
    # Reuse the already split lines, dropping the empty element produced by
    # a final terminator so this path agrees with the scan below.
    cached = lines
    cached = cached[0...-1] if cached.last && cached.last.empty?
    return n > 0 ? cached.last(n) : []
  end
  return [] unless viewable? && data

  ends_with_eol = false
  wanted = n                 # terminators still to be crossed
  found = 0
  stop = data.length         # terminators are searched strictly before this offset
  tail_start = data.length   # offset where the returned slice begins

  # `stop > 0` guards the search: a negative start offset would make rindex
  # wrap around to the end of the string.
  while found < wanted && stop > 0 && (hit = data.rindex(encoded_newlines_re, stop - 1))
    term_end = hit + Regexp.last_match(0).length

    # Searching backwards meets the "\n" of a "\r\n" first. If an earlier
    # match ends at the same offset it is the same terminator, so take the
    # wider one instead of counting the pair as two line breaks.
    if hit > 0 && (wider = data.rindex(encoded_newlines_re, hit - 1)) &&
       wider + Regexp.last_match(0).length == term_end
      hit = wider
    end

    if found == 0 && term_end == data.length
      # The final terminator only closes the last line; cross one more to
      # actually reach n lines.
      ends_with_eol = true
      wanted += 1
    end

    stop = hit
    tail_start = term_end
    found += 1
  end

  # Ran out of terminators before reaching n lines: everything qualifies.
  tail_start = 0 if found < wanted

  tail = data[tail_start..-1].split(encoded_newlines_re, -1)
  tail.pop if ends_with_eol
  tail
end
|
||||||
|
|
||||||
# Public: Get number of lines of code
|
# Public: Get number of lines of code
|
||||||
#
|
#
|
||||||
# Requires Blob#data
|
# Requires Blob#data
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ require 'linguist/tokenizer'
|
|||||||
module Linguist
|
module Linguist
|
||||||
# Language bayesian classifier.
|
# Language bayesian classifier.
|
||||||
class Classifier
|
class Classifier
|
||||||
|
CLASSIFIER_CONSIDER_BYTES = 50 * 1024
|
||||||
|
|
||||||
# Public: Use the classifier to detect language of the blob.
|
# Public: Use the classifier to detect language of the blob.
|
||||||
#
|
#
|
||||||
# blob - An object that quacks like a blob.
|
# blob - An object that quacks like a blob.
|
||||||
@@ -17,7 +19,7 @@ module Linguist
|
|||||||
# Returns an Array of Language objects, most probable first.
|
# Returns an Array of Language objects, most probable first.
|
||||||
def self.call(blob, possible_languages)
|
def self.call(blob, possible_languages)
|
||||||
language_names = possible_languages.map(&:name)
|
language_names = possible_languages.map(&:name)
|
||||||
classify(Samples.cache, blob.data, language_names).map do |name, _|
|
classify(Samples.cache, blob.data[0...CLASSIFIER_CONSIDER_BYTES], language_names).map do |name, _|
|
||||||
Language[name] # Return the actual Language objects
|
Language[name] # Return the actual Language objects
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -23,21 +23,21 @@ module Linguist
|
|||||||
#
|
#
|
||||||
# Returns a String like '100644'
|
# Returns a String like '100644'
|
||||||
def mode
|
def mode
|
||||||
File.stat(@fullpath).mode.to_s(8)
|
@mode ||= File.stat(@fullpath).mode.to_s(8)
|
||||||
end
|
end
|
||||||
|
|
||||||
# Public: Read file contents.
|
# Public: Read file contents.
|
||||||
#
|
#
|
||||||
# Returns a String.
|
# Returns a String.
|
||||||
def data
|
def data
|
||||||
File.read(@fullpath)
|
@data ||= File.read(@fullpath)
|
||||||
end
|
end
|
||||||
|
|
||||||
# Public: Get byte size
|
# Public: Get byte size
|
||||||
#
|
#
|
||||||
# Returns an Integer.
|
# Returns an Integer.
|
||||||
def size
|
def size
|
||||||
File.size(@fullpath)
|
@size ||= File.size(@fullpath)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
module Linguist
|
module Linguist
|
||||||
# A collection of simple heuristics that can be used to better analyze languages.
|
# A collection of simple heuristics that can be used to better analyze languages.
|
||||||
class Heuristics
|
class Heuristics
|
||||||
|
HEURISTICS_CONSIDER_BYTES = 50 * 1024
|
||||||
|
|
||||||
# Public: Use heuristics to detect language of the blob.
|
# Public: Use heuristics to detect language of the blob.
|
||||||
#
|
#
|
||||||
# blob - An object that quacks like a blob.
|
# blob - An object that quacks like a blob.
|
||||||
@@ -14,7 +16,7 @@ module Linguist
|
|||||||
#
|
#
|
||||||
# Returns an Array of languages, or empty if none matched or were inconclusive.
|
# Returns an Array of languages, or empty if none matched or were inconclusive.
|
||||||
def self.call(blob, candidates)
|
def self.call(blob, candidates)
|
||||||
data = blob.data
|
data = blob.data[0...HEURISTICS_CONSIDER_BYTES]
|
||||||
|
|
||||||
@heuristics.each do |heuristic|
|
@heuristics.each do |heuristic|
|
||||||
if heuristic.matches?(blob.name, candidates)
|
if heuristic.matches?(blob.name, candidates)
|
||||||
@@ -72,6 +74,14 @@ module Linguist
|
|||||||
|
|
||||||
# Common heuristics
|
# Common heuristics
|
||||||
ObjectiveCRegex = /^\s*(@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">])/
|
ObjectiveCRegex = /^\s*(@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">])/
|
||||||
|
CPlusPlusRegex = Regexp.union(
|
||||||
|
/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/,
|
||||||
|
/^\s*template\s*</,
|
||||||
|
/^[ \t]*try/,
|
||||||
|
/^[ \t]*catch\s*\(/,
|
||||||
|
/^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/,
|
||||||
|
/^[ \t]*(private|public|protected):$/,
|
||||||
|
/std::\w+/)
|
||||||
|
|
||||||
disambiguate ".as" do |data|
|
disambiguate ".as" do |data|
|
||||||
if /^\s*(package\s+[a-z0-9_\.]+|import\s+[a-zA-Z0-9_\.]+;|class\s+[A-Za-z0-9_]+\s+extends\s+[A-Za-z0-9_]+)/.match(data)
|
if /^\s*(package\s+[a-z0-9_\.]+|import\s+[a-zA-Z0-9_\.]+;|class\s+[A-Za-z0-9_]+\s+extends\s+[A-Za-z0-9_]+)/.match(data)
|
||||||
@@ -219,8 +229,7 @@ module Linguist
|
|||||||
disambiguate ".h" do |data|
|
disambiguate ".h" do |data|
|
||||||
if ObjectiveCRegex.match(data)
|
if ObjectiveCRegex.match(data)
|
||||||
Language["Objective-C"]
|
Language["Objective-C"]
|
||||||
elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
|
elsif CPlusPlusRegex.match(data)
|
||||||
/^\s*template\s*</.match(data) || /^[ \t]*try/.match(data) || /^[ \t]*catch\s*\(/.match(data) || /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/.match(data) || /^[ \t]*(private|public|protected):$/.match(data) || /std::\w+/.match(data))
|
|
||||||
Language["C++"]
|
Language["C++"]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -109,8 +109,8 @@ module Linguist
|
|||||||
# Returns an Array with one Language if the blob has a Vim or Emacs modeline
|
# Returns an Array with one Language if the blob has a Vim or Emacs modeline
|
||||||
# that matches a Language name or alias. Returns an empty array if no match.
|
# that matches a Language name or alias. Returns an empty array if no match.
|
||||||
def self.call(blob, _ = nil)
|
def self.call(blob, _ = nil)
|
||||||
header = blob.lines.first(SEARCH_SCOPE).join("\n")
|
header = blob.first_lines(SEARCH_SCOPE).join("\n")
|
||||||
footer = blob.lines.last(SEARCH_SCOPE).join("\n")
|
footer = blob.last_lines(SEARCH_SCOPE).join("\n")
|
||||||
Array(Language.find_by_alias(modeline(header + footer)))
|
Array(Language.find_by_alias(modeline(header + footer)))
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
require 'strscan'
|
require 'strscan'
|
||||||
|
require 'linguist/linguist'
|
||||||
|
|
||||||
module Linguist
|
module Linguist
|
||||||
# Generic programming language tokenizer.
|
# Generic programming language tokenizer.
|
||||||
@@ -15,191 +16,5 @@ module Linguist
|
|||||||
def self.tokenize(data)
|
def self.tokenize(data)
|
||||||
new.extract_tokens(data)
|
new.extract_tokens(data)
|
||||||
end
|
end
|
||||||
|
|
||||||
# Read up to 100KB
|
|
||||||
BYTE_LIMIT = 100_000
|
|
||||||
|
|
||||||
# Start state on token, ignore anything till the next newline
|
|
||||||
SINGLE_LINE_COMMENTS = [
|
|
||||||
'//', # C
|
|
||||||
'--', # Ada, Haskell, AppleScript
|
|
||||||
'#', # Ruby
|
|
||||||
'%', # Tex
|
|
||||||
'"', # Vim
|
|
||||||
]
|
|
||||||
|
|
||||||
# Start state on opening token, ignore anything until the closing
|
|
||||||
# token is reached.
|
|
||||||
MULTI_LINE_COMMENTS = [
|
|
||||||
['/*', '*/'], # C
|
|
||||||
['<!--', '-->'], # XML
|
|
||||||
['{-', '-}'], # Haskell
|
|
||||||
['(*', '*)'], # Coq
|
|
||||||
['"""', '"""'], # Python
|
|
||||||
["'''", "'''"] # Python
|
|
||||||
]
|
|
||||||
|
|
||||||
START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
|
|
||||||
"\s*#{Regexp.escape(c)} "
|
|
||||||
}.join("|"))
|
|
||||||
|
|
||||||
START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
|
|
||||||
Regexp.escape(c[0])
|
|
||||||
}.join("|"))
|
|
||||||
|
|
||||||
# Internal: Extract generic tokens from data.
|
|
||||||
#
|
|
||||||
# data - String to scan.
|
|
||||||
#
|
|
||||||
# Examples
|
|
||||||
#
|
|
||||||
# extract_tokens("printf('Hello')")
|
|
||||||
# # => ['printf', '(', ')']
|
|
||||||
#
|
|
||||||
# Returns Array of token Strings.
|
|
||||||
def extract_tokens(data)
|
|
||||||
s = StringScanner.new(data)
|
|
||||||
|
|
||||||
tokens = []
|
|
||||||
until s.eos?
|
|
||||||
break if s.pos >= BYTE_LIMIT
|
|
||||||
|
|
||||||
if token = s.scan(/^#!.+$/)
|
|
||||||
if name = extract_shebang(token)
|
|
||||||
tokens << "SHEBANG#!#{name}"
|
|
||||||
end
|
|
||||||
|
|
||||||
# Single line comment
|
|
||||||
elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
|
|
||||||
# tokens << token.strip
|
|
||||||
s.skip_until(/\n|\Z/)
|
|
||||||
|
|
||||||
# Multiline comments
|
|
||||||
elsif token = s.scan(START_MULTI_LINE_COMMENT)
|
|
||||||
# tokens << token
|
|
||||||
close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
|
|
||||||
s.skip_until(Regexp.compile(Regexp.escape(close_token)))
|
|
||||||
# tokens << close_token
|
|
||||||
|
|
||||||
# Skip single or double quoted strings
|
|
||||||
elsif s.scan(/"/)
|
|
||||||
if s.peek(1) == "\""
|
|
||||||
s.getch
|
|
||||||
else
|
|
||||||
s.skip_until(/(?<!\\)"/)
|
|
||||||
end
|
|
||||||
elsif s.scan(/'/)
|
|
||||||
if s.peek(1) == "'"
|
|
||||||
s.getch
|
|
||||||
else
|
|
||||||
s.skip_until(/(?<!\\)'/)
|
|
||||||
end
|
|
||||||
|
|
||||||
# Skip number literals
|
|
||||||
elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
|
|
||||||
|
|
||||||
# SGML style brackets
|
|
||||||
elsif token = s.scan(/<[^\s<>][^<>]*>/)
|
|
||||||
extract_sgml_tokens(token).each { |t| tokens << t }
|
|
||||||
|
|
||||||
# Common programming punctuation
|
|
||||||
elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
|
|
||||||
tokens << token
|
|
||||||
|
|
||||||
# Regular token
|
|
||||||
elsif token = s.scan(/[\w\.@#\/\*]+/)
|
|
||||||
tokens << token
|
|
||||||
|
|
||||||
# Common operators
|
|
||||||
elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
|
|
||||||
tokens << token
|
|
||||||
|
|
||||||
else
|
|
||||||
s.getch
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
tokens
|
|
||||||
end
|
|
||||||
|
|
||||||
# Internal: Extract normalized shebang command token.
|
|
||||||
#
|
|
||||||
# Examples
|
|
||||||
#
|
|
||||||
# extract_shebang("#!/usr/bin/ruby")
|
|
||||||
# # => "ruby"
|
|
||||||
#
|
|
||||||
# extract_shebang("#!/usr/bin/env node")
|
|
||||||
# # => "node"
|
|
||||||
#
|
|
||||||
# extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
|
|
||||||
# # => "awk"
|
|
||||||
#
|
|
||||||
# Returns String token or nil it couldn't be parsed.
|
|
||||||
def extract_shebang(data)
|
|
||||||
s = StringScanner.new(data)
|
|
||||||
|
|
||||||
if path = s.scan(/^#!\s*\S+/)
|
|
||||||
script = path.split('/').last
|
|
||||||
if script == 'env'
|
|
||||||
s.scan(/\s+/)
|
|
||||||
s.scan(/.*=[^\s]+\s+/)
|
|
||||||
script = s.scan(/\S+/)
|
|
||||||
end
|
|
||||||
script = script[/[^\d]+/, 0] if script
|
|
||||||
return script
|
|
||||||
end
|
|
||||||
|
|
||||||
nil
|
|
||||||
end
|
|
||||||
|
|
||||||
# Internal: Extract tokens from inside SGML tag.
|
|
||||||
#
|
|
||||||
# data - SGML tag String.
|
|
||||||
#
|
|
||||||
# Examples
|
|
||||||
#
|
|
||||||
# extract_sgml_tokens("<a href='' class=foo>")
|
|
||||||
# # => ["<a>", "href="]
|
|
||||||
#
|
|
||||||
# Returns Array of token Strings.
|
|
||||||
def extract_sgml_tokens(data)
|
|
||||||
s = StringScanner.new(data)
|
|
||||||
|
|
||||||
tokens = []
|
|
||||||
|
|
||||||
until s.eos?
|
|
||||||
# Emit start token
|
|
||||||
if token = s.scan(/<\/?[^\s>]+/)
|
|
||||||
tokens << "#{token}>"
|
|
||||||
|
|
||||||
# Emit attributes with trailing =
|
|
||||||
elsif token = s.scan(/\w+=/)
|
|
||||||
tokens << token
|
|
||||||
|
|
||||||
# Then skip over attribute value
|
|
||||||
if s.scan(/"/)
|
|
||||||
s.skip_until(/[^\\]"/)
|
|
||||||
elsif s.scan(/'/)
|
|
||||||
s.skip_until(/[^\\]'/)
|
|
||||||
else
|
|
||||||
s.skip_until(/\w+/)
|
|
||||||
end
|
|
||||||
|
|
||||||
# Emit lone attributes
|
|
||||||
elsif token = s.scan(/\w+/)
|
|
||||||
tokens << token
|
|
||||||
|
|
||||||
# Stop at the end of the tag
|
|
||||||
elsif s.scan(/>/)
|
|
||||||
s.terminate
|
|
||||||
|
|
||||||
else
|
|
||||||
s.getch
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
tokens
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
Reference in New Issue
Block a user