Mirror of https://github.com/KevinMidboe/linguist.git, synced 2025-10-29 17:50:22 +00:00

Compare commits: master...revert-384 (1 commit)
| Author | SHA1 | Date |
|---|---|---|
|  | 0698b0f36e |  |

.gitignore (3 changes, vendored)

```diff
@@ -8,6 +8,3 @@ lib/linguist/samples.json
 /node_modules
 test/fixtures/ace_modes.json
 /vendor/gems/
-/tmp
-*.bundle
-*.so
```
							
								
								
									
Rakefile (23 changes)

```diff
@@ -1,7 +1,6 @@
 require 'bundler/setup'
 require 'rake/clean'
 require 'rake/testtask'
-require 'rake/extensiontask'
 require 'yaml'
 require 'yajl'
 require 'open-uri'
@@ -11,14 +10,8 @@ task :default => :test
 
 Rake::TestTask.new
 
-gem_spec = Gem::Specification.load('github-linguist.gemspec')
-
-Rake::ExtensionTask.new('linguist', gem_spec) do |ext|
-  ext.lib_dir = File.join('lib', 'linguist')
-end
-
 # Extend test task to check for samples and fetch latest Ace modes
-task :test => [:compile, :check_samples, :fetch_ace_modes]
+task :test => [:check_samples, :fetch_ace_modes]
 
 desc "Check that we have samples.json generated"
 task :check_samples do
@@ -41,24 +34,12 @@ task :fetch_ace_modes do
   end
 end
 
-task :samples => :compile do
+task :samples do
   require 'linguist/samples'
   json = Yajl.dump(Linguist::Samples.data, :pretty => true)
   File.write 'lib/linguist/samples.json', json
 end
 
-FLEX_MIN_VER = [2, 5, 39]
-task :flex do
-  if `flex -V` !~ /^flex (\d+)\.(\d+)\.(\d+)/
-    fail "flex not detected"
-  end
-  maj, min, rev = $1.to_i, $2.to_i, $3.to_i
-  if maj < FLEX_MIN_VER[0] || (maj == FLEX_MIN_VER[0] && (min < FLEX_MIN_VER[1] || (min == FLEX_MIN_VER[1] && rev < FLEX_MIN_VER[2])))
-    fail "building linguist's lexer requires at least flex #{FLEX_MIN_VER.join(".")}"
-  end
-  system "cd ext/linguist && flex tokenizer.l"
-end
-
 task :build_gem => :samples do
   rm_rf "grammars"
   sh "script/convert-grammars"
```
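Aside: the deleted :flex task implements its minimum-version gate as a hand-rolled comparison of the (major, minor, rev) triple. The same gate can be expressed with Ruby's built-in Gem::Version; a hypothetical sketch, not code from this repo:

```ruby
# Hypothetical restatement of the deleted flex version gate.
# Assumes `flex -V` prints something like "flex 2.6.4".
FLEX_MIN_VER = Gem::Version.new("2.5.39")

task :flex do
  version = `flex -V`[/\d+\.\d+\.\d+/] or fail "flex not detected"
  if Gem::Version.new(version) < FLEX_MIN_VER
    fail "building linguist's lexer requires at least flex #{FLEX_MIN_VER}"
  end
  sh "cd ext/linguist && flex tokenizer.l"
end
```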
ext/linguist/extconf.rb (deleted)

```diff
@@ -1,3 +0,0 @@
-require 'mkmf'
-dir_config('linguist')
-create_makefile('linguist/linguist')
```
										
											
Generated lexer source (presumably ext/linguist/lex.linguist_yy.c): deleted; diff suppressed because it is too large.
ext/linguist/lex.linguist_yy.h (deleted)

```diff
@@ -1,353 +0,0 @@
-#ifndef linguist_yyHEADER_H
-#define linguist_yyHEADER_H 1
-#define linguist_yyIN_HEADER 1
-
-#line 6 "lex.linguist_yy.h"
-
-#define  YY_INT_ALIGNED short int
-
-/* A lexical scanner generated by flex */
-
-#define FLEX_SCANNER
-#define YY_FLEX_MAJOR_VERSION 2
-#define YY_FLEX_MINOR_VERSION 5
-#define YY_FLEX_SUBMINOR_VERSION 39
-#if YY_FLEX_SUBMINOR_VERSION > 0
-#define FLEX_BETA
-#endif
-
-/* First, we deal with  platform-specific or compiler-specific issues. */
-
-/* begin standard C headers. */
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-#include <stdlib.h>
-
-/* end standard C headers. */
-
-/* flex integer type definitions */
-
-#ifndef FLEXINT_H
-#define FLEXINT_H
-
-/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
-
-#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
-
-/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
- * if you want the limit (max/min) macros for int types.
- */
-#ifndef __STDC_LIMIT_MACROS
-#define __STDC_LIMIT_MACROS 1
-#endif
-
-#include <inttypes.h>
-typedef int8_t flex_int8_t;
-typedef uint8_t flex_uint8_t;
-typedef int16_t flex_int16_t;
-typedef uint16_t flex_uint16_t;
-typedef int32_t flex_int32_t;
-typedef uint32_t flex_uint32_t;
-#else
-typedef signed char flex_int8_t;
-typedef short int flex_int16_t;
-typedef int flex_int32_t;
-typedef unsigned char flex_uint8_t;
-typedef unsigned short int flex_uint16_t;
-typedef unsigned int flex_uint32_t;
-
-/* Limits of integral types. */
-#ifndef INT8_MIN
-#define INT8_MIN               (-128)
-#endif
-#ifndef INT16_MIN
-#define INT16_MIN              (-32767-1)
-#endif
-#ifndef INT32_MIN
-#define INT32_MIN              (-2147483647-1)
-#endif
-#ifndef INT8_MAX
-#define INT8_MAX               (127)
-#endif
-#ifndef INT16_MAX
-#define INT16_MAX              (32767)
-#endif
-#ifndef INT32_MAX
-#define INT32_MAX              (2147483647)
-#endif
-#ifndef UINT8_MAX
-#define UINT8_MAX              (255U)
-#endif
-#ifndef UINT16_MAX
-#define UINT16_MAX             (65535U)
-#endif
-#ifndef UINT32_MAX
-#define UINT32_MAX             (4294967295U)
-#endif
-
-#endif /* ! C99 */
-
-#endif /* ! FLEXINT_H */
-
-#ifdef __cplusplus
-
-/* The "const" storage-class-modifier is valid. */
-#define YY_USE_CONST
-
-#else	/* ! __cplusplus */
-
-/* C99 requires __STDC__ to be defined as 1. */
-#if defined (__STDC__)
-
-#define YY_USE_CONST
-
-#endif	/* defined (__STDC__) */
-#endif	/* ! __cplusplus */
-
-#ifdef YY_USE_CONST
-#define yyconst const
-#else
-#define yyconst
-#endif
-
-/* An opaque pointer. */
-#ifndef YY_TYPEDEF_YY_SCANNER_T
-#define YY_TYPEDEF_YY_SCANNER_T
-typedef void* yyscan_t;
-#endif
-
-/* For convenience, these vars (plus the bison vars far below)
-   are macros in the reentrant scanner. */
-#define yyin yyg->yyin_r
-#define yyout yyg->yyout_r
-#define yyextra yyg->yyextra_r
-#define yyleng yyg->yyleng_r
-#define yytext yyg->yytext_r
-#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
-#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
-#define yy_flex_debug yyg->yy_flex_debug_r
-
-/* Size of default input buffer. */
-#ifndef YY_BUF_SIZE
-#ifdef __ia64__
-/* On IA-64, the buffer size is 16k, not 8k.
- * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
- * Ditto for the __ia64__ case accordingly.
- */
-#define YY_BUF_SIZE 32768
-#else
-#define YY_BUF_SIZE 16384
-#endif /* __ia64__ */
-#endif
-
-#ifndef YY_TYPEDEF_YY_BUFFER_STATE
-#define YY_TYPEDEF_YY_BUFFER_STATE
-typedef struct yy_buffer_state *YY_BUFFER_STATE;
-#endif
-
-#ifndef YY_TYPEDEF_YY_SIZE_T
-#define YY_TYPEDEF_YY_SIZE_T
-typedef size_t yy_size_t;
-#endif
-
-#ifndef YY_STRUCT_YY_BUFFER_STATE
-#define YY_STRUCT_YY_BUFFER_STATE
-struct yy_buffer_state
-	{
-	FILE *yy_input_file;
-
-	char *yy_ch_buf;		/* input buffer */
-	char *yy_buf_pos;		/* current position in input buffer */
-
-	/* Size of input buffer in bytes, not including room for EOB
-	 * characters.
-	 */
-	yy_size_t yy_buf_size;
-
-	/* Number of characters read into yy_ch_buf, not including EOB
-	 * characters.
-	 */
-	yy_size_t yy_n_chars;
-
-	/* Whether we "own" the buffer - i.e., we know we created it,
-	 * and can realloc() it to grow it, and should free() it to
-	 * delete it.
-	 */
-	int yy_is_our_buffer;
-
-	/* Whether this is an "interactive" input source; if so, and
-	 * if we're using stdio for input, then we want to use getc()
-	 * instead of fread(), to make sure we stop fetching input after
-	 * each newline.
-	 */
-	int yy_is_interactive;
-
-	/* Whether we're considered to be at the beginning of a line.
-	 * If so, '^' rules will be active on the next match, otherwise
-	 * not.
-	 */
-	int yy_at_bol;
-
-    int yy_bs_lineno; /**< The line count. */
-    int yy_bs_column; /**< The column count. */
-
-	/* Whether to try to fill the input buffer when we reach the
-	 * end of it.
-	 */
-	int yy_fill_buffer;
-
-	int yy_buffer_status;
-
-	};
-#endif /* !YY_STRUCT_YY_BUFFER_STATE */
-
-void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
-void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
-YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
-void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
-void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
-void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
-void linguist_yypop_buffer_state (yyscan_t yyscanner );
-
-YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
-YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
-YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );
-
-void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
-void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
-void linguist_yyfree (void * ,yyscan_t yyscanner );
-
-/* Begin user sect3 */
-
-#define yytext_ptr yytext_r
-
-#ifdef YY_HEADER_EXPORT_START_CONDITIONS
-#define INITIAL 0
-#define sgml 1
-#define c_comment 2
-#define xml_comment 3
-#define haskell_comment 4
-#define ocaml_comment 5
-#define python_dcomment 6
-#define python_scomment 7
-
-#endif
-
-#ifndef YY_NO_UNISTD_H
-/* Special case for "unistd.h", since it is non-ANSI. We include it way
- * down here because we want the user's section 1 to have been scanned first.
- * The user has a chance to override it with an option.
- */
-#include <unistd.h>
-#endif
-
-#define YY_EXTRA_TYPE struct tokenizer_extra *
-
-int linguist_yylex_init (yyscan_t* scanner);
-
-int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
-
-/* Accessor methods to globals.
-   These are made visible to non-reentrant scanners for convenience. */
-
-int linguist_yylex_destroy (yyscan_t yyscanner );
-
-int linguist_yyget_debug (yyscan_t yyscanner );
-
-void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );
-
-YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );
-
-void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
-
-FILE *linguist_yyget_in (yyscan_t yyscanner );
-
-void linguist_yyset_in  (FILE * in_str ,yyscan_t yyscanner );
-
-FILE *linguist_yyget_out (yyscan_t yyscanner );
-
-void linguist_yyset_out  (FILE * out_str ,yyscan_t yyscanner );
-
-yy_size_t linguist_yyget_leng (yyscan_t yyscanner );
-
-char *linguist_yyget_text (yyscan_t yyscanner );
-
-int linguist_yyget_lineno (yyscan_t yyscanner );
-
-void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
-
-int linguist_yyget_column  (yyscan_t yyscanner );
-
-void linguist_yyset_column (int column_no ,yyscan_t yyscanner );
-
-/* Macros after this point can all be overridden by user definitions in
- * section 1.
- */
-
-#ifndef YY_SKIP_YYWRAP
-#ifdef __cplusplus
-extern "C" int linguist_yywrap (yyscan_t yyscanner );
-#else
-extern int linguist_yywrap (yyscan_t yyscanner );
-#endif
-#endif
-
-#ifndef yytext_ptr
-static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
-#endif
-
-#ifdef YY_NEED_STRLEN
-static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
-#endif
-
-#ifndef YY_NO_INPUT
-
-#endif
-
-/* Amount of stuff to slurp up with each read. */
-#ifndef YY_READ_BUF_SIZE
-#ifdef __ia64__
-/* On IA-64, the buffer size is 16k, not 8k */
-#define YY_READ_BUF_SIZE 16384
-#else
-#define YY_READ_BUF_SIZE 8192
-#endif /* __ia64__ */
-#endif
-
-/* Number of entries by which start-condition stack grows. */
-#ifndef YY_START_STACK_INCR
-#define YY_START_STACK_INCR 25
-#endif
-
-/* Default declaration of generated scanner - a define so the user can
- * easily add parameters.
- */
-#ifndef YY_DECL
-#define YY_DECL_IS_OURS 1
-
-extern int linguist_yylex (yyscan_t yyscanner);
-
-#define YY_DECL int linguist_yylex (yyscan_t yyscanner)
-#endif /* !YY_DECL */
-
-/* yy_get_previous_state - get the state just before the EOB char was reached */
-
-#undef YY_NEW_FILE
-#undef YY_FLUSH_BUFFER
-#undef yy_set_bol
-#undef yy_new_buffer
-#undef yy_set_interactive
-#undef YY_DO_BEFORE_ACTION
-
-#ifdef YY_DECL_IS_OURS
-#undef YY_DECL_IS_OURS
-#undef YY_DECL
-#endif
-
-#line 117 "tokenizer.l"
-
-
-#line 352 "lex.linguist_yy.h"
-#undef linguist_yyIN_HEADER
-#endif /* linguist_yyHEADER_H */
```
ext/linguist/linguist.c (deleted)

```diff
@@ -1,64 +0,0 @@
-#include "ruby.h"
-#include "linguist.h"
-#include "lex.linguist_yy.h"
-
-int linguist_yywrap(yyscan_t yyscanner) {
-	return 1;
-}
-
-static VALUE rb_tokenizer_extract_tokens(VALUE self, VALUE rb_data) {
-	YY_BUFFER_STATE buf;
-	yyscan_t scanner;
-	struct tokenizer_extra extra;
-	VALUE ary, s;
-	long len;
-	int r;
-
-	Check_Type(rb_data, T_STRING);
-
-	len = RSTRING_LEN(rb_data);
-	if (len > 100000)
-		len = 100000;
-
-	linguist_yylex_init_extra(&extra, &scanner);
-	buf = linguist_yy_scan_bytes(RSTRING_PTR(rb_data), (int) len, scanner);
-
-	ary = rb_ary_new();
-	do {
-		extra.type = NO_ACTION;
-		extra.token = NULL;
-		r = linguist_yylex(scanner);
-		switch (extra.type) {
-		case NO_ACTION:
-			break;
-		case REGULAR_TOKEN:
-			rb_ary_push(ary, rb_str_new2(extra.token));
-			free(extra.token);
-			break;
-		case SHEBANG_TOKEN:
-			s = rb_str_new2("SHEBANG#!");
-			rb_str_cat2(s, extra.token);
-			rb_ary_push(ary, s);
-			free(extra.token);
-			break;
-		case SGML_TOKEN:
-			s = rb_str_new2(extra.token);
-			rb_str_cat2(s, ">");
-			rb_ary_push(ary, s);
-			free(extra.token);
-			break;
-		}
-	} while (r);
-
-	linguist_yy_delete_buffer(buf, scanner);
-	linguist_yylex_destroy(scanner);
-
-	return ary;
-}
-
-__attribute__((visibility("default"))) void Init_linguist() {
-	VALUE rb_mLinguist = rb_define_module("Linguist");
-	VALUE rb_cTokenizer = rb_define_class_under(rb_mLinguist, "Tokenizer", rb_cObject);
-
-	rb_define_method(rb_cTokenizer, "extract_tokens", rb_tokenizer_extract_tokens, 1);
-}
```
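For orientation: Init_linguist above registered Linguist::Tokenizer#extract_tokens, so before this revert Tokenizer.tokenize delegated straight into this C code. A sketch of that pre-revert call path, with approximate output:

```ruby
# Pre-revert call path (sketch). Requiring the compiled bundle ran
# Init_linguist, which defined #extract_tokens in C.
require 'linguist/linguist'

Linguist::Tokenizer.new.extract_tokens("#!/bin/sh\nls /tmp\n")
# => roughly ["SHEBANG#!sh", "ls", "/tmp"], per the SHEBANG_TOKEN branch above
```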
ext/linguist/linguist.h (deleted)

```diff
@@ -1,11 +0,0 @@
-enum tokenizer_type {
-  NO_ACTION,
-  REGULAR_TOKEN,
-  SHEBANG_TOKEN,
-  SGML_TOKEN,
-};
-
-struct tokenizer_extra {
-  char *token;
-  enum tokenizer_type type;
-};
```
ext/linguist/tokenizer.l (deleted)

```diff
@@ -1,119 +0,0 @@
-%{
-
-#include "linguist.h"
-
-#define feed_token(tok, typ) do { \
-    yyextra->token = (tok); \
-    yyextra->type = (typ); \
-  } while (0)
-
-#define eat_until_eol() do { \
-    int c; \
-    while ((c = input(yyscanner)) != '\n' && c != EOF); \
-    if (c == EOF) \
-      yyterminate(); \
-  } while (0)
-
-#define eat_until_unescaped(q) do { \
-    int c; \
-    while ((c = input(yyscanner)) != EOF) { \
-      if (c == '\n') \
-        break; \
-      if (c == '\\') { \
-        c = input(yyscanner); \
-        if (c == EOF) \
-          yyterminate(); \
-      } else if (c == q) \
-        break; \
-    } \
-    if (c == EOF) \
-      yyterminate(); \
-  } while (0)
-
-%}
-
-%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"
-%x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment
-
-%%
-
-^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
-    const char *off = strrchr(yytext, ' ');
-    if (!off)
-      off = yytext;
-    else
-      ++off;
-    feed_token(strdup(off), SHEBANG_TOKEN);
-    eat_until_eol();
-    return 1;
-  }
-
-^#![ \t]*[[:alpha:]_\/]+ {
-    const char *off = strrchr(yytext, '/');
-    if (!off)
-      off = yytext;
-    else
-      ++off;
-    if (strcmp(off, "env") == 0) {
-      eat_until_eol();
-    } else {
-      feed_token(strdup(off), SHEBANG_TOKEN);
-      eat_until_eol();
-      return 1;
-    }
-  }
-
-^[ \t]*(\/\/|--|\#|%|\")" ".*   { /* nothing */ }
-
-"/*"                              { BEGIN(c_comment); }
-  /* See below for xml_comment start. */
-"{-"                              { BEGIN(haskell_comment); }
-"(*"                              { BEGIN(ocaml_comment); }
-"\"\"\""                          { BEGIN(python_dcomment); }
-"'''"                             { BEGIN(python_scomment); }
-
-<c_comment,xml_comment,haskell_comment,ocaml_comment,python_dcomment,python_scomment>.|\n { /* nothing */ }
-<c_comment>"*/"                   { BEGIN(INITIAL); }
-<xml_comment>"-->"                { BEGIN(INITIAL); }
-<haskell_comment>"-}"             { BEGIN(INITIAL); }
-<ocaml_comment>"*)"               { BEGIN(INITIAL); }
-<python_dcomment>"\"\"\""         { BEGIN(INITIAL); }
-<python_scomment>"'''"            { BEGIN(INITIAL); }
-
-\"\"|''                           { /* nothing */ }
-\"                                { eat_until_unescaped('"'); }
-'                                 { eat_until_unescaped('\''); }
-(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }
-\<[^ \t\n\r<>]+/>|" "[^<>\n]{0,2048}>               {
-    if (strcmp(yytext, "<!--") == 0) {
-     BEGIN(xml_comment);
-    } else {
-      feed_token(strdup(yytext), SGML_TOKEN);
-      BEGIN(sgml);
-      return 1;
-    }
-  }
-<sgml>[[:alnum:]_]+=/\"           { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('"'); return 1; }
-<sgml>[[:alnum:]_]+=/'            { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('\''); return 1; }
-<sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
-<sgml>[[:alnum:]_]+               { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
-<sgml>\>                          { BEGIN(INITIAL); }
-<sgml>.|\n                        { /* nothing */ }
-;|\{|\}|\(|\)|\[|\]               { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
-[[:alnum:]_.@#/*]+                {
-    if (strncmp(yytext, "/*", 2) == 0) {
-      if (strlen(yytext) >= 4 && strcmp(yytext + strlen(yytext) - 2, "*/") == 0) {
-        /* nothing */
-      } else {
-        BEGIN(c_comment);
-      }
-    } else {
-      feed_token(strdup(yytext), REGULAR_TOKEN);
-      return 1;
-    }
-  }
-\<\<?|\+|\-|\*|\/|%|&&?|\|\|?     { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
-.|\n                              { /* nothing */ }
-
-%%
-
```
github-linguist.gemspec (changed)

```diff
@@ -10,9 +10,8 @@ Gem::Specification.new do |s|
   s.homepage = "https://github.com/github/linguist"
   s.license  = "MIT"
 
-  s.files = Dir['lib/**/*'] + Dir['ext/**/*'] + Dir['grammars/*'] + ['LICENSE']
+  s.files = Dir['lib/**/*'] + Dir['grammars/*'] + ['LICENSE']
   s.executables = ['linguist', 'git-linguist']
-  s.extensions = ['ext/linguist/extconf.rb']
 
   s.add_dependency 'charlock_holmes', '~> 0.7.5'
   s.add_dependency 'escape_utils',    '~> 1.1.0'
@@ -20,7 +19,6 @@ Gem::Specification.new do |s|
   s.add_dependency 'rugged',          '>= 0.25.1'
 
   s.add_development_dependency 'minitest', '>= 5.0'
-  s.add_development_dependency 'rake-compiler', '~> 0.9'
   s.add_development_dependency 'mocha'
   s.add_development_dependency 'plist', '~>3.1'
   s.add_development_dependency 'pry'
```
lib/linguist/blob_helper.rb (changed)

```diff
@@ -275,8 +275,10 @@ module Linguist
           # also--importantly--without having to duplicate many (potentially
           # large) strings.
           begin
-
-            data.split(encoded_newlines_re, -1)
+            encoded_newlines = ["\r\n", "\r", "\n"].
+              map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) }
+
+            data.split(Regexp.union(encoded_newlines), -1)
           rescue Encoding::ConverterNotFoundError
             # The data is not splittable in the detected encoding.  Assume it's
             # one big line.
@@ -287,51 +289,6 @@ module Linguist
         end
     end
 
-    def encoded_newlines_re
-      @encoded_newlines_re ||= Regexp.union(["\r\n", "\r", "\n"].
-                                              map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) })
-
-    end
-
-    def first_lines(n)
-      return lines[0...n] if defined? @lines
-      return [] unless viewable? && data
-
-      i, c = 0, 0
-      while c < n && j = data.index(encoded_newlines_re, i)
-        i = j + $&.length
-        c += 1
-      end
-      data[0...i].split(encoded_newlines_re, -1)
-    end
-
-    def last_lines(n)
-      if defined? @lines
-        if n >= @lines.length
-          @lines
-        else
-          lines[-n..-1]
-        end
-      end
-      return [] unless viewable? && data
-
-      no_eol = true
-      i, c = data.length, 0
-      k = i
-      while c < n && j = data.rindex(encoded_newlines_re, i - 1)
-        if c == 0 && j + $&.length == i
-          no_eol = false
-          n += 1
-        end
-        i = j
-        k = j + $&.length
-        c += 1
-      end
-      r = data[k..-1].split(encoded_newlines_re, -1)
-      r.pop if !no_eol
-      r
-    end
-
     # Public: Get number of lines of code
     #
     # Requires Blob#data
```
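The revert inlines what the removed encoded_newlines_re memoized: each newline style is encoded into the blob's detected encoding, then relabeled to data.encoding so String#split compares raw bytes. A self-contained sketch of that dance, with illustrative values rather than repo code:

```ruby
# Sketch: split UTF-16LE bytes on encoded newlines, mirroring the diff above.
raw = "a\nb\r\nc".encode("UTF-16LE").force_encoding("ASCII-8BIT")

newlines = ["\r\n", "\r", "\n"].map do |nl|
  nl.encode("UTF-16LE", "ASCII-8BIT").force_encoding(raw.encoding)
end

# "\r\n" comes first in the union, so CRLF matches as one separator.
raw.split(Regexp.union(newlines), -1).map(&:bytesize)  # => [2, 2, 2]
```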
lib/linguist/classifier.rb (changed)

```diff
@@ -3,8 +3,6 @@ require 'linguist/tokenizer'
 module Linguist
   # Language bayesian classifier.
   class Classifier
-    CLASSIFIER_CONSIDER_BYTES = 50 * 1024
-
     # Public: Use the classifier to detect language of the blob.
     #
     # blob               - An object that quacks like a blob.
@@ -19,7 +17,7 @@ module Linguist
     # Returns an Array of Language objects, most probable first.
     def self.call(blob, possible_languages)
       language_names = possible_languages.map(&:name)
-      classify(Samples.cache, blob.data[0...CLASSIFIER_CONSIDER_BYTES], language_names).map do |name, _|
+      classify(Samples.cache, blob.data, language_names).map do |name, _|
         Language[name] # Return the actual Language objects
       end
     end
```
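With the 50KB cap gone, Classifier.call feeds the whole blob to classify. Usage per the signature and docstring above; the FileBlob construction here is an assumption, since anything that quacks like a blob works:

```ruby
# Sketch of the documented entry point: rank candidate languages for a blob.
blob = Linguist::FileBlob.new("script.rb")  # assumed blob-like object
candidates = [Linguist::Language["Ruby"], Linguist::Language["Python"]]

Linguist::Classifier.call(blob, candidates)
# => Array of Language objects, most probable first (per the docstring)
```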
lib/linguist/file_blob.rb (changed)

```diff
@@ -23,21 +23,21 @@ module Linguist
     #
     # Returns a String like '100644'
     def mode
-      @mode ||= File.stat(@fullpath).mode.to_s(8)
+      File.stat(@fullpath).mode.to_s(8)
     end
 
     # Public: Read file contents.
     #
     # Returns a String.
     def data
-      @data ||= File.read(@fullpath)
+      File.read(@fullpath)
     end
 
     # Public: Get byte size
     #
     # Returns an Integer.
     def size
-      @size ||= File.size(@fullpath)
+      File.size(@fullpath)
     end
   end
 end
```
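Design note: dropping the @attr ||= memoization trades caching for freshness, so every call re-reads the filesystem. A caller that depended on the old behavior would now cache on its own side; a hypothetical sketch:

```ruby
# Hypothetical caller-side cache replacing the memoization removed above.
blob = Linguist::FileBlob.new("lib/linguist.rb")
data = blob.data  # hits the disk on every call after this change
size = blob.size  # likewise; hold onto these locally if reuse matters
```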
lib/linguist/heuristics.rb (changed)

```diff
@@ -1,8 +1,6 @@
 module Linguist
   # A collection of simple heuristics that can be used to better analyze languages.
   class Heuristics
-    HEURISTICS_CONSIDER_BYTES = 50 * 1024
-
     # Public: Use heuristics to detect language of the blob.
     #
     # blob               - An object that quacks like a blob.
@@ -16,7 +14,7 @@ module Linguist
     #
     # Returns an Array of languages, or empty if none matched or were inconclusive.
     def self.call(blob, candidates)
-      data = blob.data[0...HEURISTICS_CONSIDER_BYTES]
+      data = blob.data
 
       @heuristics.each do |heuristic|
         if heuristic.matches?(blob.name, candidates)
@@ -74,14 +72,6 @@ module Linguist
 
     # Common heuristics
     ObjectiveCRegex = /^\s*(@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">])/
-    CPlusPlusRegex = Regexp.union(
-        /^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/,
-        /^\s*template\s*</,
-        /^[ \t]*try/,
-        /^[ \t]*catch\s*\(/,
-        /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/,
-        /^[ \t]*(private|public|protected):$/,
-        /std::\w+/)
 
     disambiguate ".as" do |data|
       if /^\s*(package\s+[a-z0-9_\.]+|import\s+[a-zA-Z0-9_\.]+;|class\s+[A-Za-z0-9_]+\s+extends\s+[A-Za-z0-9_]+)/.match(data)
@@ -229,7 +219,8 @@ module Linguist
     disambiguate ".h" do |data|
      if ObjectiveCRegex.match(data)
         Language["Objective-C"]
-      elsif CPlusPlusRegex.match(data)
+      elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
+        /^\s*template\s*</.match(data) || /^[ \t]*try/.match(data) || /^[ \t]*catch\s*\(/.match(data) || /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/.match(data) || /^[ \t]*(private|public|protected):$/.match(data) || /std::\w+/.match(data))
         Language["C++"]
       end
     end
```
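The long elsif chain in the .h rule is just the unrolled form of the deleted CPlusPlusRegex: Regexp.union matches when any branch does. The equivalence in miniature, with a two-pattern subset:

```ruby
# Equivalence sketch: a union of patterns matches when any branch matches.
cpp = Regexp.union(/^\s*template\s*</, /std::\w+/)

cpp.match?("std::string s;")   # => true, same as /std::\w+/.match(...)
cpp.match?("int main() {}")    # => false
```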
lib/linguist/strategy/modeline.rb (changed)

```diff
@@ -109,8 +109,8 @@ module Linguist
       # Returns an Array with one Language if the blob has a Vim or Emacs modeline
       # that matches a Language name or alias. Returns an empty array if no match.
       def self.call(blob, _ = nil)
-        header = blob.first_lines(SEARCH_SCOPE).join("\n")
-        footer = blob.last_lines(SEARCH_SCOPE).join("\n")
+        header = blob.lines.first(SEARCH_SCOPE).join("\n")
+        footer = blob.lines.last(SEARCH_SCOPE).join("\n")
         Array(Language.find_by_alias(modeline(header + footer)))
       end
 
```
lib/linguist/tokenizer.rb (changed)

```diff
@@ -1,5 +1,4 @@
 require 'strscan'
-require 'linguist/linguist'
 
 module Linguist
   # Generic programming language tokenizer.
@@ -16,5 +15,191 @@ module Linguist
     def self.tokenize(data)
       new.extract_tokens(data)
     end
+
+    # Read up to 100KB
+    BYTE_LIMIT = 100_000
+
+    # Start state on token, ignore anything till the next newline
+    SINGLE_LINE_COMMENTS = [
+      '//', # C
+      '--', # Ada, Haskell, AppleScript
+      '#',  # Ruby
+      '%',  # Tex
+      '"',  # Vim
+    ]
+
+    # Start state on opening token, ignore anything until the closing
+    # token is reached.
+    MULTI_LINE_COMMENTS = [
+      ['/*', '*/'],    # C
+      ['<!--', '-->'], # XML
+      ['{-', '-}'],    # Haskell
+      ['(*', '*)'],    # Coq
+      ['"""', '"""'],  # Python
+      ["'''", "'''"]   # Python
+    ]
+
+    START_SINGLE_LINE_COMMENT =  Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
+      "\s*#{Regexp.escape(c)} "
+    }.join("|"))
+
+    START_MULTI_LINE_COMMENT =  Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
+      Regexp.escape(c[0])
+    }.join("|"))
+
+    # Internal: Extract generic tokens from data.
+    #
+    # data - String to scan.
+    #
+    # Examples
+    #
+    #   extract_tokens("printf('Hello')")
+    #   # => ['printf', '(', ')']
+    #
+    # Returns Array of token Strings.
+    def extract_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+      until s.eos?
+        break if s.pos >= BYTE_LIMIT
+
+        if token = s.scan(/^#!.+$/)
+          if name = extract_shebang(token)
+            tokens << "SHEBANG#!#{name}"
+          end
+
+        # Single line comment
+        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
+          # tokens << token.strip
+          s.skip_until(/\n|\Z/)
+
+        # Multiline comments
+        elsif token = s.scan(START_MULTI_LINE_COMMENT)
+          # tokens << token
+          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
+          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
+          # tokens << close_token
+
+        # Skip single or double quoted strings
+        elsif s.scan(/"/)
+          if s.peek(1) == "\""
+            s.getch
+          else
+            s.skip_until(/(?<!\\)"/)
+          end
+        elsif s.scan(/'/)
+          if s.peek(1) == "'"
+            s.getch
+          else
+            s.skip_until(/(?<!\\)'/)
+          end
+
+        # Skip number literals
+        elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
+
+        # SGML style brackets
+        elsif token = s.scan(/<[^\s<>][^<>]*>/)
+          extract_sgml_tokens(token).each { |t| tokens << t }
+
+        # Common programming punctuation
+        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
+          tokens << token
+
+        # Regular token
+        elsif token = s.scan(/[\w\.@#\/\*]+/)
+          tokens << token
+
+        # Common operators
+        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
+          tokens << token
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
+
+    # Internal: Extract normalized shebang command token.
+    #
+    # Examples
+    #
+    #   extract_shebang("#!/usr/bin/ruby")
+    #   # => "ruby"
+    #
+    #   extract_shebang("#!/usr/bin/env node")
+    #   # => "node"
+    #
+    #   extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
+    #   # => "awk"
+    #
+    # Returns String token or nil it couldn't be parsed.
+    def extract_shebang(data)
+      s = StringScanner.new(data)
+
+      if path = s.scan(/^#!\s*\S+/)
+        script = path.split('/').last
+        if script == 'env'
+          s.scan(/\s+/)
+          s.scan(/.*=[^\s]+\s+/)
+          script = s.scan(/\S+/)
+        end
+        script = script[/[^\d]+/, 0] if script
+        return script
+      end
+
+      nil
+    end
+
+    # Internal: Extract tokens from inside SGML tag.
+    #
+    # data - SGML tag String.
+    #
+    # Examples
+    #
+    #   extract_sgml_tokens("<a href='' class=foo>")
+    #   # => ["<a>", "href="]
+    #
+    # Returns Array of token Strings.
+    def extract_sgml_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+
+      until s.eos?
+        # Emit start token
+        if token = s.scan(/<\/?[^\s>]+/)
+          tokens << "#{token}>"
+
+        # Emit attributes with trailing =
+        elsif token = s.scan(/\w+=/)
+          tokens << token
+
+          # Then skip over attribute value
+          if s.scan(/"/)
+            s.skip_until(/[^\\]"/)
+          elsif s.scan(/'/)
+            s.skip_until(/[^\\]'/)
+          else
+            s.skip_until(/\w+/)
+          end
+
+        # Emit lone attributes
+        elsif token = s.scan(/\w+/)
+          tokens << token
+
+        # Stop at the end of the tag
+        elsif s.scan(/>/)
+          s.terminate
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
   end
 end
```
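With the pure-Ruby tokenizer restored, the public entry point is the class method at the top of the file. A quick usage sketch based on the inline docs above:

```ruby
# Usage per the documented examples in the diff above.
require 'linguist/tokenizer'

Linguist::Tokenizer.tokenize("printf('Hello')")
# => ["printf", "(", ")"]

Linguist::Tokenizer.new.extract_shebang("#!/usr/bin/env node")
# => "node"
```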