Mirror of https://github.com/KevinMidboe/linguist.git (synced 2025-10-29 17:50:22 +00:00)
Compare commits

1 commit

Comparing vmg/empty-...revert-384

| Author | SHA1 | Date |
|---|---|---|
|  | 0698b0f36e |  |
.gitignore (3 lines changed, vendored)
@@ -8,6 +8,3 @@ lib/linguist/samples.json
 /node_modules
 test/fixtures/ace_modes.json
 /vendor/gems/
-/tmp
-*.bundle
-*.so
Rakefile (23 lines changed)
@@ -1,7 +1,6 @@
 require 'bundler/setup'
 require 'rake/clean'
 require 'rake/testtask'
-require 'rake/extensiontask'
 require 'yaml'
 require 'yajl'
 require 'open-uri'
@@ -11,14 +10,8 @@ task :default => :test
 
 Rake::TestTask.new
 
-gem_spec = Gem::Specification.load('github-linguist.gemspec')
-
-Rake::ExtensionTask.new('linguist', gem_spec) do |ext|
-  ext.lib_dir = File.join('lib', 'linguist')
-end
-
 # Extend test task to check for samples and fetch latest Ace modes
-task :test => [:compile, :check_samples, :fetch_ace_modes]
+task :test => [:check_samples, :fetch_ace_modes]
 
 desc "Check that we have samples.json generated"
 task :check_samples do
@@ -41,24 +34,12 @@ task :fetch_ace_modes do
   end
 end
 
-task :samples => :compile do
+task :samples do
   require 'linguist/samples'
   json = Yajl.dump(Linguist::Samples.data, :pretty => true)
   File.write 'lib/linguist/samples.json', json
 end
 
-FLEX_MIN_VER = [2, 5, 39]
-task :flex do
-  if `flex -V` !~ /^flex (\d+)\.(\d+)\.(\d+)/
-    fail "flex not detected"
-  end
-  maj, min, rev = $1.to_i, $2.to_i, $3.to_i
-  if maj < FLEX_MIN_VER[0] || (maj == FLEX_MIN_VER[0] && (min < FLEX_MIN_VER[1] || (min == FLEX_MIN_VER[1] && rev < FLEX_MIN_VER[2])))
-    fail "building linguist's lexer requires at least flex #{FLEX_MIN_VER.join(".")}"
-  end
-  system "cd ext/linguist && flex tokenizer.l"
-end
-
 task :build_gem => :samples do
   rm_rf "grammars"
   sh "script/convert-grammars"
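As an aside, the removed :flex guard compares version components by hand; Ruby's Array#<=> performs the same lexicographic comparison more directly. A minimal standalone sketch (not part of this diff):

FLEX_MIN_VER = [2, 5, 39]

# Returns true when the `flex -V` output reports at least FLEX_MIN_VER.
def flex_new_enough?(version_output)
  return false unless version_output =~ /^flex (\d+)\.(\d+)\.(\d+)/
  # Array#<=> compares element-wise, so [2, 6, 0] sorts after [2, 5, 39].
  ([$1.to_i, $2.to_i, $3.to_i] <=> FLEX_MIN_VER) >= 0
end

flex_new_enough?("flex 2.6.4")  # => true
flex_new_enough?("flex 2.5.35") # => false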
ext/linguist/extconf.rb (deleted)
@@ -1,3 +0,0 @@
require 'mkmf'
dir_config('linguist')
create_makefile('linguist/linguist')
File diff suppressed because it is too large
ext/linguist/lex.linguist_yy.h (deleted)
@@ -1,353 +0,0 @@
#ifndef linguist_yyHEADER_H
#define linguist_yyHEADER_H 1
#define linguist_yyIN_HEADER 1

#line 6 "lex.linguist_yy.h"

#define  YY_INT_ALIGNED short int

/* A lexical scanner generated by flex */

#define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 5
#define YY_FLEX_SUBMINOR_VERSION 39
#if YY_FLEX_SUBMINOR_VERSION > 0
#define FLEX_BETA
#endif

/* First, we deal with  platform-specific or compiler-specific issues. */

/* begin standard C headers. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>

/* end standard C headers. */

/* flex integer type definitions */

#ifndef FLEXINT_H
#define FLEXINT_H

/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */

#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L

/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
 * if you want the limit (max/min) macros for int types.
 */
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS 1
#endif

#include <inttypes.h>
typedef int8_t flex_int8_t;
typedef uint8_t flex_uint8_t;
typedef int16_t flex_int16_t;
typedef uint16_t flex_uint16_t;
typedef int32_t flex_int32_t;
typedef uint32_t flex_uint32_t;
#else
typedef signed char flex_int8_t;
typedef short int flex_int16_t;
typedef int flex_int32_t;
typedef unsigned char flex_uint8_t;
typedef unsigned short int flex_uint16_t;
typedef unsigned int flex_uint32_t;

/* Limits of integral types. */
#ifndef INT8_MIN
#define INT8_MIN               (-128)
#endif
#ifndef INT16_MIN
#define INT16_MIN              (-32767-1)
#endif
#ifndef INT32_MIN
#define INT32_MIN              (-2147483647-1)
#endif
#ifndef INT8_MAX
#define INT8_MAX               (127)
#endif
#ifndef INT16_MAX
#define INT16_MAX              (32767)
#endif
#ifndef INT32_MAX
#define INT32_MAX              (2147483647)
#endif
#ifndef UINT8_MAX
#define UINT8_MAX              (255U)
#endif
#ifndef UINT16_MAX
#define UINT16_MAX             (65535U)
#endif
#ifndef UINT32_MAX
#define UINT32_MAX             (4294967295U)
#endif

#endif /* ! C99 */

#endif /* ! FLEXINT_H */

#ifdef __cplusplus

/* The "const" storage-class-modifier is valid. */
#define YY_USE_CONST

#else	/* ! __cplusplus */

/* C99 requires __STDC__ to be defined as 1. */
#if defined (__STDC__)

#define YY_USE_CONST

#endif	/* defined (__STDC__) */
#endif	/* ! __cplusplus */

#ifdef YY_USE_CONST
#define yyconst const
#else
#define yyconst
#endif

/* An opaque pointer. */
#ifndef YY_TYPEDEF_YY_SCANNER_T
#define YY_TYPEDEF_YY_SCANNER_T
typedef void* yyscan_t;
#endif

/* For convenience, these vars (plus the bison vars far below)
   are macros in the reentrant scanner. */
#define yyin yyg->yyin_r
#define yyout yyg->yyout_r
#define yyextra yyg->yyextra_r
#define yyleng yyg->yyleng_r
#define yytext yyg->yytext_r
#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
#define yy_flex_debug yyg->yy_flex_debug_r

/* Size of default input buffer. */
#ifndef YY_BUF_SIZE
#ifdef __ia64__
/* On IA-64, the buffer size is 16k, not 8k.
 * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
 * Ditto for the __ia64__ case accordingly.
 */
#define YY_BUF_SIZE 32768
#else
#define YY_BUF_SIZE 16384
#endif /* __ia64__ */
#endif

#ifndef YY_TYPEDEF_YY_BUFFER_STATE
#define YY_TYPEDEF_YY_BUFFER_STATE
typedef struct yy_buffer_state *YY_BUFFER_STATE;
#endif

#ifndef YY_TYPEDEF_YY_SIZE_T
#define YY_TYPEDEF_YY_SIZE_T
typedef size_t yy_size_t;
#endif

#ifndef YY_STRUCT_YY_BUFFER_STATE
#define YY_STRUCT_YY_BUFFER_STATE
struct yy_buffer_state
	{
	FILE *yy_input_file;

	char *yy_ch_buf;		/* input buffer */
	char *yy_buf_pos;		/* current position in input buffer */

	/* Size of input buffer in bytes, not including room for EOB
	 * characters.
	 */
	yy_size_t yy_buf_size;

	/* Number of characters read into yy_ch_buf, not including EOB
	 * characters.
	 */
	yy_size_t yy_n_chars;

	/* Whether we "own" the buffer - i.e., we know we created it,
	 * and can realloc() it to grow it, and should free() it to
	 * delete it.
	 */
	int yy_is_our_buffer;

	/* Whether this is an "interactive" input source; if so, and
	 * if we're using stdio for input, then we want to use getc()
	 * instead of fread(), to make sure we stop fetching input after
	 * each newline.
	 */
	int yy_is_interactive;

	/* Whether we're considered to be at the beginning of a line.
	 * If so, '^' rules will be active on the next match, otherwise
	 * not.
	 */
	int yy_at_bol;

    int yy_bs_lineno; /**< The line count. */
    int yy_bs_column; /**< The column count. */

	/* Whether to try to fill the input buffer when we reach the
	 * end of it.
	 */
	int yy_fill_buffer;

	int yy_buffer_status;

	};
#endif /* !YY_STRUCT_YY_BUFFER_STATE */

void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
void linguist_yypop_buffer_state (yyscan_t yyscanner );

YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );

void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
void linguist_yyfree (void * ,yyscan_t yyscanner );

/* Begin user sect3 */

#define yytext_ptr yytext_r

#ifdef YY_HEADER_EXPORT_START_CONDITIONS
#define INITIAL 0
#define sgml 1
#define c_comment 2
#define xml_comment 3
#define haskell_comment 4
#define ocaml_comment 5
#define python_dcomment 6
#define python_scomment 7

#endif

#ifndef YY_NO_UNISTD_H
/* Special case for "unistd.h", since it is non-ANSI. We include it way
 * down here because we want the user's section 1 to have been scanned first.
 * The user has a chance to override it with an option.
 */
#include <unistd.h>
#endif

#define YY_EXTRA_TYPE struct tokenizer_extra *

int linguist_yylex_init (yyscan_t* scanner);

int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);

/* Accessor methods to globals.
   These are made visible to non-reentrant scanners for convenience. */

int linguist_yylex_destroy (yyscan_t yyscanner );

int linguist_yyget_debug (yyscan_t yyscanner );

void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );

YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );

void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );

FILE *linguist_yyget_in (yyscan_t yyscanner );

void linguist_yyset_in  (FILE * in_str ,yyscan_t yyscanner );

FILE *linguist_yyget_out (yyscan_t yyscanner );

void linguist_yyset_out  (FILE * out_str ,yyscan_t yyscanner );

yy_size_t linguist_yyget_leng (yyscan_t yyscanner );

char *linguist_yyget_text (yyscan_t yyscanner );

int linguist_yyget_lineno (yyscan_t yyscanner );

void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );

int linguist_yyget_column  (yyscan_t yyscanner );

void linguist_yyset_column (int column_no ,yyscan_t yyscanner );

/* Macros after this point can all be overridden by user definitions in
 * section 1.
 */

#ifndef YY_SKIP_YYWRAP
#ifdef __cplusplus
extern "C" int linguist_yywrap (yyscan_t yyscanner );
#else
extern int linguist_yywrap (yyscan_t yyscanner );
#endif
#endif

#ifndef yytext_ptr
static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
#endif

#ifdef YY_NEED_STRLEN
static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
#endif

#ifndef YY_NO_INPUT

#endif

/* Amount of stuff to slurp up with each read. */
#ifndef YY_READ_BUF_SIZE
#ifdef __ia64__
/* On IA-64, the buffer size is 16k, not 8k */
#define YY_READ_BUF_SIZE 16384
#else
#define YY_READ_BUF_SIZE 8192
#endif /* __ia64__ */
#endif

/* Number of entries by which start-condition stack grows. */
#ifndef YY_START_STACK_INCR
#define YY_START_STACK_INCR 25
#endif

/* Default declaration of generated scanner - a define so the user can
 * easily add parameters.
 */
#ifndef YY_DECL
#define YY_DECL_IS_OURS 1

extern int linguist_yylex (yyscan_t yyscanner);

#define YY_DECL int linguist_yylex (yyscan_t yyscanner)
#endif /* !YY_DECL */

/* yy_get_previous_state - get the state just before the EOB char was reached */

#undef YY_NEW_FILE
#undef YY_FLUSH_BUFFER
#undef yy_set_bol
#undef yy_new_buffer
#undef yy_set_interactive
#undef YY_DO_BEFORE_ACTION

#ifdef YY_DECL_IS_OURS
#undef YY_DECL_IS_OURS
#undef YY_DECL
#endif

#line 117 "tokenizer.l"


#line 352 "lex.linguist_yy.h"
#undef linguist_yyIN_HEADER
#endif /* linguist_yyHEADER_H */
ext/linguist/linguist.c (deleted)
@@ -1,64 +0,0 @@
#include "ruby.h"
#include "linguist.h"
#include "lex.linguist_yy.h"

int linguist_yywrap(yyscan_t yyscanner) {
	return 1;
}

static VALUE rb_tokenizer_extract_tokens(VALUE self, VALUE rb_data) {
	YY_BUFFER_STATE buf;
	yyscan_t scanner;
	struct tokenizer_extra extra;
	VALUE ary, s;
	long len;
	int r;

	Check_Type(rb_data, T_STRING);

	len = RSTRING_LEN(rb_data);
	if (len > 100000)
		len = 100000;

	linguist_yylex_init_extra(&extra, &scanner);
	buf = linguist_yy_scan_bytes(RSTRING_PTR(rb_data), (int) len, scanner);

	ary = rb_ary_new();
	do {
		extra.type = NO_ACTION;
		extra.token = NULL;
		r = linguist_yylex(scanner);
		switch (extra.type) {
		case NO_ACTION:
			break;
		case REGULAR_TOKEN:
			rb_ary_push(ary, rb_str_new2(extra.token));
			free(extra.token);
			break;
		case SHEBANG_TOKEN:
			s = rb_str_new2("SHEBANG#!");
			rb_str_cat2(s, extra.token);
			rb_ary_push(ary, s);
			free(extra.token);
			break;
		case SGML_TOKEN:
			s = rb_str_new2(extra.token);
			rb_str_cat2(s, ">");
			rb_ary_push(ary, s);
			free(extra.token);
			break;
		}
	} while (r);

	linguist_yy_delete_buffer(buf, scanner);
	linguist_yylex_destroy(scanner);

	return ary;
}

__attribute__((visibility("default"))) void Init_linguist() {
	VALUE rb_mLinguist = rb_define_module("Linguist");
	VALUE rb_cTokenizer = rb_define_class_under(rb_mLinguist, "Tokenizer", rb_cObject);

	rb_define_method(rb_cTokenizer, "extract_tokens", rb_tokenizer_extract_tokens, 1);
}
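Before this revert, Init_linguist above registered the scanner as Linguist::Tokenizer#extract_tokens, capping input at 100,000 bytes and prefixing shebang tokens with "SHEBANG#!". A rough sketch of how the binding was called (illustrative input; the output shape is inferred from the C switch above):

require 'linguist/tokenizer'

Linguist::Tokenizer.new.extract_tokens("#!/usr/bin/env ruby\nputs 'hi'")
# => ["SHEBANG#!ruby", "puts"]   (the string literal is skipped by the lexer)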
ext/linguist/linguist.h (deleted)
@@ -1,11 +0,0 @@
enum tokenizer_type {
  NO_ACTION,
  REGULAR_TOKEN,
  SHEBANG_TOKEN,
  SGML_TOKEN,
};

struct tokenizer_extra {
  char *token;
  enum tokenizer_type type;
};
ext/linguist/tokenizer.l (deleted)
@@ -1,119 +0,0 @@
%{

#include "linguist.h"

#define feed_token(tok, typ) do { \
    yyextra->token = (tok); \
    yyextra->type = (typ); \
  } while (0)

#define eat_until_eol() do { \
    int c; \
    while ((c = input(yyscanner)) != '\n' && c != EOF); \
    if (c == EOF) \
      yyterminate(); \
  } while (0)

#define eat_until_unescaped(q) do { \
    int c; \
    while ((c = input(yyscanner)) != EOF) { \
      if (c == '\n') \
        break; \
      if (c == '\\') { \
        c = input(yyscanner); \
        if (c == EOF) \
          yyterminate(); \
      } else if (c == q) \
        break; \
    } \
    if (c == EOF) \
      yyterminate(); \
  } while (0)

%}

%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"
%x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment

%%

^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
    const char *off = strrchr(yytext, ' ');
    if (!off)
      off = yytext;
    else
      ++off;
    feed_token(strdup(off), SHEBANG_TOKEN);
    eat_until_eol();
    return 1;
  }

^#![ \t]*[[:alpha:]_\/]+ {
    const char *off = strrchr(yytext, '/');
    if (!off)
      off = yytext;
    else
      ++off;
    if (strcmp(off, "env") == 0) {
      eat_until_eol();
    } else {
      feed_token(strdup(off), SHEBANG_TOKEN);
      eat_until_eol();
      return 1;
    }
  }

^[ \t]*(\/\/|--|\#|%|\")" ".*   { /* nothing */ }

"/*"                              { BEGIN(c_comment); }
  /* See below for xml_comment start. */
"{-"                              { BEGIN(haskell_comment); }
"(*"                              { BEGIN(ocaml_comment); }
"\"\"\""                          { BEGIN(python_dcomment); }
"'''"                             { BEGIN(python_scomment); }

<c_comment,xml_comment,haskell_comment,ocaml_comment,python_dcomment,python_scomment>.|\n { /* nothing */ }
<c_comment>"*/"                   { BEGIN(INITIAL); }
<xml_comment>"-->"                { BEGIN(INITIAL); }
<haskell_comment>"-}"             { BEGIN(INITIAL); }
<ocaml_comment>"*)"               { BEGIN(INITIAL); }
<python_dcomment>"\"\"\""         { BEGIN(INITIAL); }
<python_scomment>"'''"            { BEGIN(INITIAL); }

\"\"|''                           { /* nothing */ }
\"                                { eat_until_unescaped('"'); }
'                                 { eat_until_unescaped('\''); }
(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }
\<[^ \t\n\r<>]+/>|" "[^<>\n]{0,2048}>               {
    if (strcmp(yytext, "<!--") == 0) {
     BEGIN(xml_comment);
    } else {
      feed_token(strdup(yytext), SGML_TOKEN);
      BEGIN(sgml);
      return 1;
    }
  }
<sgml>[[:alnum:]_]+=/\"           { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('"'); return 1; }
<sgml>[[:alnum:]_]+=/'            { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('\''); return 1; }
<sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
<sgml>[[:alnum:]_]+               { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
<sgml>\>                          { BEGIN(INITIAL); }
<sgml>.|\n                        { /* nothing */ }
;|\{|\}|\(|\)|\[|\]               { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
[[:alnum:]_.@#/*]+                {
    if (strncmp(yytext, "/*", 2) == 0) {
      if (strlen(yytext) >= 4 && strcmp(yytext + strlen(yytext) - 2, "*/") == 0) {
        /* nothing */
      } else {
        BEGIN(c_comment);
      }
    } else {
      feed_token(strdup(yytext), REGULAR_TOKEN);
      return 1;
    }
  }
\<\<?|\+|\-|\*|\/|%|&&?|\|\|?     { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
.|\n                              { /* nothing */ }

%%
github-linguist.gemspec
@@ -10,9 +10,8 @@ Gem::Specification.new do |s|
   s.homepage = "https://github.com/github/linguist"
   s.license  = "MIT"
 
-  s.files = Dir['lib/**/*'] + Dir['ext/**/*'] + Dir['grammars/*'] + ['LICENSE']
+  s.files = Dir['lib/**/*'] + Dir['grammars/*'] + ['LICENSE']
   s.executables = ['linguist', 'git-linguist']
-  s.extensions = ['ext/linguist/extconf.rb']
 
   s.add_dependency 'charlock_holmes', '~> 0.7.5'
   s.add_dependency 'escape_utils',    '~> 1.1.0'
@@ -20,7 +19,6 @@ Gem::Specification.new do |s|
   s.add_dependency 'rugged',          '>= 0.25.1'
 
   s.add_development_dependency 'minitest', '>= 5.0'
-  s.add_development_dependency 'rake-compiler', '~> 0.9'
   s.add_development_dependency 'mocha'
   s.add_development_dependency 'plist', '~>3.1'
   s.add_development_dependency 'pry'
lib/linguist/blob_helper.rb
@@ -275,8 +275,10 @@ module Linguist
           # also--importantly--without having to duplicate many (potentially
           # large) strings.
           begin
-            data.split(encoded_newlines_re, -1)
+            encoded_newlines = ["\r\n", "\r", "\n"].
+              map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) }
+
+            data.split(Regexp.union(encoded_newlines), -1)
           rescue Encoding::ConverterNotFoundError
             # The data is not splittable in the detected encoding.  Assume it's
             # one big line.
@@ -287,51 +289,6 @@ module Linguist
         end
     end
 
-    def encoded_newlines_re
-      @encoded_newlines_re ||= Regexp.union(["\r\n", "\r", "\n"].
-                                              map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) })
-
-    end
-
-    def first_lines(n)
-      return lines[0...n] if defined? @lines
-      return [] unless viewable? && data
-
-      i, c = 0, 0
-      while c < n && j = data.index(encoded_newlines_re, i)
-        i = j + $&.length
-        c += 1
-      end
-      data[0...i].split(encoded_newlines_re, -1)
-    end
-
-    def last_lines(n)
-      if defined? @lines
-        if n >= @lines.length
-          @lines
-        else
-          lines[-n..-1]
-        end
-      end
-      return [] unless viewable? && data
-
-      no_eol = true
-      i, c = data.length, 0
-      k = i
-      while c < n && j = data.rindex(encoded_newlines_re, i - 1)
-        if c == 0 && j + $&.length == i
-          no_eol = false
-          n += 1
-        end
-        i = j
-        k = j + $&.length
-        c += 1
-      end
-      r = data[k..-1].split(encoded_newlines_re, -1)
-      r.pop if !no_eol
-      r
-    end
-
     # Public: Get number of lines of code
     #
     # Requires Blob#data
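The replacement splits on a Regexp.union of the three newline encodings; union preserves order, so "\r\n" is tried before "\r" and a CRLF counts as one separator. A quick sketch of the behavior (plain ASCII data assumed):

encoded_newlines = ["\r\n", "\r", "\n"]

"a\r\nb\rc\nd".split(Regexp.union(encoded_newlines), -1)
# => ["a", "b", "c", "d"]

# The -1 limit keeps trailing empty fields, so a final newline is preserved:
"x\n".split(Regexp.union(encoded_newlines), -1)
# => ["x", ""]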
lib/linguist/classifier.rb
@@ -3,8 +3,6 @@ require 'linguist/tokenizer'
 module Linguist
   # Language bayesian classifier.
   class Classifier
-    CLASSIFIER_CONSIDER_BYTES = 50 * 1024
-
     # Public: Use the classifier to detect language of the blob.
     #
     # blob               - An object that quacks like a blob.
@@ -19,7 +17,7 @@ module Linguist
     # Returns an Array of Language objects, most probable first.
     def self.call(blob, possible_languages)
       language_names = possible_languages.map(&:name)
-      classify(Samples.cache, blob.data[0...CLASSIFIER_CONSIDER_BYTES], language_names).map do |name, _|
+      classify(Samples.cache, blob.data, language_names).map do |name, _|
         Language[name] # Return the actual Language objects
       end
     end
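With the byte cap gone, Classifier.call now feeds the blob's entire data to classify. A sketch of invoking it; the Struct stand-in for a blob is an assumption for illustration, not part of the API:

require 'linguist'

FakeBlob = Struct.new(:name, :data)  # anything that "quacks like a blob"

blob = FakeBlob.new("hello.rb", "puts 'Hello, world'\n")
candidates = [Linguist::Language["Ruby"], Linguist::Language["Python"]]

# Returns the candidate Language objects, most probable first.
Linguist::Classifier.call(blob, candidates)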
lib/linguist/file_blob.rb
@@ -23,21 +23,21 @@ module Linguist
     #
     # Returns a String like '100644'
     def mode
-      @mode ||= File.stat(@fullpath).mode.to_s(8)
+      File.stat(@fullpath).mode.to_s(8)
     end
 
     # Public: Read file contents.
     #
     # Returns a String.
     def data
-      @data ||= File.read(@fullpath)
+      File.read(@fullpath)
     end
 
     # Public: Get byte size
     #
     # Returns an Integer.
     def size
-      @size ||= File.size(@fullpath)
+      File.size(@fullpath)
     end
   end
 end
lib/linguist/heuristics.rb
@@ -1,8 +1,6 @@
 module Linguist
   # A collection of simple heuristics that can be used to better analyze languages.
   class Heuristics
-    HEURISTICS_CONSIDER_BYTES = 50 * 1024
-
     # Public: Use heuristics to detect language of the blob.
     #
     # blob               - An object that quacks like a blob.
@@ -16,7 +14,7 @@ module Linguist
     #
     # Returns an Array of languages, or empty if none matched or were inconclusive.
     def self.call(blob, candidates)
-      data = blob.data[0...HEURISTICS_CONSIDER_BYTES]
+      data = blob.data
 
       @heuristics.each do |heuristic|
         if heuristic.matches?(blob.name, candidates)
@@ -74,14 +72,6 @@ module Linguist
 
     # Common heuristics
     ObjectiveCRegex = /^\s*(@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">])/
-    CPlusPlusRegex = Regexp.union(
-        /^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/,
-        /^\s*template\s*</,
-        /^[ \t]*try/,
-        /^[ \t]*catch\s*\(/,
-        /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/,
-        /^[ \t]*(private|public|protected):$/,
-        /std::\w+/)
 
     disambiguate ".as" do |data|
       if /^\s*(package\s+[a-z0-9_\.]+|import\s+[a-zA-Z0-9_\.]+;|class\s+[A-Za-z0-9_]+\s+extends\s+[A-Za-z0-9_]+)/.match(data)
@@ -229,7 +219,8 @@ module Linguist
     disambiguate ".h" do |data|
       if ObjectiveCRegex.match(data)
         Language["Objective-C"]
-      elsif CPlusPlusRegex.match(data)
+      elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
+        /^\s*template\s*</.match(data) || /^[ \t]*try/.match(data) || /^[ \t]*catch\s*\(/.match(data) || /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/.match(data) || /^[ \t]*(private|public|protected):$/.match(data) || /std::\w+/.match(data))
         Language["C++"]
       end
     end
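The inlined .h disambiguation can be exercised directly; in this sketch the Struct blob stub is an assumption for illustration:

require 'linguist'

HeaderBlob = Struct.new(:name, :data)

blob = HeaderBlob.new("list.h", "#include <vector>\nstd::vector<int> v;\n")
candidates = [Linguist::Language["C"], Linguist::Language["C++"], Linguist::Language["Objective-C"]]

# /std::\w+/ (among others) matches, so the header resolves to C++.
Linguist::Heuristics.call(blob, candidates)
# => [Linguist::Language["C++"]]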
lib/linguist/strategy/modeline.rb
@@ -109,8 +109,8 @@ module Linguist
       # Returns an Array with one Language if the blob has a Vim or Emacs modeline
       # that matches a Language name or alias. Returns an empty array if no match.
       def self.call(blob, _ = nil)
-        header = blob.first_lines(SEARCH_SCOPE).join("\n")
-        footer = blob.last_lines(SEARCH_SCOPE).join("\n")
+        header = blob.lines.first(SEARCH_SCOPE).join("\n")
+        footer = blob.lines.last(SEARCH_SCOPE).join("\n")
         Array(Language.find_by_alias(modeline(header + footer)))
       end
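After this change the strategy reads blob.lines directly instead of the removed first_lines/last_lines helpers. A sketch with a stub blob (the Struct is an illustrative assumption):

require 'linguist'

ModelineBlob = Struct.new(:lines)

blob = ModelineBlob.new(["# vim: set filetype=ruby:", "puts 1"])

# The Vim modeline resolves through Language.find_by_alias("ruby").
Linguist::Strategy::Modeline.call(blob)
# => [Linguist::Language["Ruby"]]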
lib/linguist/tokenizer.rb
@@ -1,5 +1,4 @@
 require 'strscan'
-require 'linguist/linguist'
 
 module Linguist
   # Generic programming language tokenizer.
@@ -16,5 +15,191 @@ module Linguist
     def self.tokenize(data)
       new.extract_tokens(data)
     end
+
+    # Read up to 100KB
+    BYTE_LIMIT = 100_000
+
+    # Start state on token, ignore anything till the next newline
+    SINGLE_LINE_COMMENTS = [
+      '//', # C
+      '--', # Ada, Haskell, AppleScript
+      '#',  # Ruby
+      '%',  # Tex
+      '"',  # Vim
+    ]
+
+    # Start state on opening token, ignore anything until the closing
+    # token is reached.
+    MULTI_LINE_COMMENTS = [
+      ['/*', '*/'],    # C
+      ['<!--', '-->'], # XML
+      ['{-', '-}'],    # Haskell
+      ['(*', '*)'],    # Coq
+      ['"""', '"""'],  # Python
+      ["'''", "'''"]   # Python
+    ]
+
+    START_SINGLE_LINE_COMMENT =  Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
+      "\s*#{Regexp.escape(c)} "
+    }.join("|"))
+
+    START_MULTI_LINE_COMMENT =  Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
+      Regexp.escape(c[0])
+    }.join("|"))
+
+    # Internal: Extract generic tokens from data.
+    #
+    # data - String to scan.
+    #
+    # Examples
+    #
+    #   extract_tokens("printf('Hello')")
+    #   # => ['printf', '(', ')']
+    #
+    # Returns Array of token Strings.
+    def extract_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+      until s.eos?
+        break if s.pos >= BYTE_LIMIT
+
+        if token = s.scan(/^#!.+$/)
+          if name = extract_shebang(token)
+            tokens << "SHEBANG#!#{name}"
+          end
+
+        # Single line comment
+        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
+          # tokens << token.strip
+          s.skip_until(/\n|\Z/)
+
+        # Multiline comments
+        elsif token = s.scan(START_MULTI_LINE_COMMENT)
+          # tokens << token
+          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
+          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
+          # tokens << close_token
+
+        # Skip single or double quoted strings
+        elsif s.scan(/"/)
+          if s.peek(1) == "\""
+            s.getch
+          else
+            s.skip_until(/(?<!\\)"/)
+          end
+        elsif s.scan(/'/)
+          if s.peek(1) == "'"
+            s.getch
+          else
+            s.skip_until(/(?<!\\)'/)
+          end
+
+        # Skip number literals
+        elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
+
+        # SGML style brackets
+        elsif token = s.scan(/<[^\s<>][^<>]*>/)
+          extract_sgml_tokens(token).each { |t| tokens << t }
+
+        # Common programming punctuation
+        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
+          tokens << token
+
+        # Regular token
+        elsif token = s.scan(/[\w\.@#\/\*]+/)
+          tokens << token
+
+        # Common operators
+        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
+          tokens << token
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
+
+    # Internal: Extract normalized shebang command token.
+    #
+    # Examples
+    #
+    #   extract_shebang("#!/usr/bin/ruby")
+    #   # => "ruby"
+    #
+    #   extract_shebang("#!/usr/bin/env node")
+    #   # => "node"
+    #
+    #   extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
+    #   # => "awk"
+    #
+    # Returns String token or nil it couldn't be parsed.
+    def extract_shebang(data)
+      s = StringScanner.new(data)
+
+      if path = s.scan(/^#!\s*\S+/)
+        script = path.split('/').last
+        if script == 'env'
+          s.scan(/\s+/)
+          s.scan(/.*=[^\s]+\s+/)
+          script = s.scan(/\S+/)
+        end
+        script = script[/[^\d]+/, 0] if script
+        return script
+      end
+
+      nil
+    end
+
+    # Internal: Extract tokens from inside SGML tag.
+    #
+    # data - SGML tag String.
+    #
+    # Examples
+    #
+    #   extract_sgml_tokens("<a href='' class=foo>")
+    #   # => ["<a>", "href="]
+    #
+    # Returns Array of token Strings.
+    def extract_sgml_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+
+      until s.eos?
+        # Emit start token
+        if token = s.scan(/<\/?[^\s>]+/)
+          tokens << "#{token}>"
+
+        # Emit attributes with trailing =
+        elsif token = s.scan(/\w+=/)
+          tokens << token
+
+          # Then skip over attribute value
+          if s.scan(/"/)
+            s.skip_until(/[^\\]"/)
+          elsif s.scan(/'/)
+            s.skip_until(/[^\\]'/)
+          else
+            s.skip_until(/\w+/)
+          end
+
+        # Emit lone attributes
+        elsif token = s.scan(/\w+/)
+          tokens << token
+
+        # Stop at the end of the tag
+        elsif s.scan(/>/)
+          s.terminate
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
   end
 end
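The restored pure-Ruby tokenizer behaves as its inline docs describe; a brief sketch using the documented examples:

require 'linguist/tokenizer'

Linguist::Tokenizer.tokenize("printf('Hello')")
# => ["printf", "(", ")"]   (the string literal is skipped)

Linguist::Tokenizer.new.extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
# => "awk"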