From 9a8ab45b6f91c5caaae13ff4ba46135c05081aa2 Mon Sep 17 00:00:00 2001 From: Ashe Connor Date: Fri, 1 Dec 2017 13:41:59 +1100 Subject: [PATCH] Limit tokens to 64 characters or less (#3922) --- ext/linguist/linguist.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/ext/linguist/linguist.c b/ext/linguist/linguist.c index b31dea88..ce5fe001 100644 --- a/ext/linguist/linguist.c +++ b/ext/linguist/linguist.c @@ -2,6 +2,9 @@ #include "linguist.h" #include "lex.linguist_yy.h" +// Anything longer is unlikely to be useful. +#define MAX_TOKEN_LEN 64 + int linguist_yywrap(yyscan_t yyscanner) { return 1; } @@ -32,19 +35,27 @@ static VALUE rb_tokenizer_extract_tokens(VALUE self, VALUE rb_data) { case NO_ACTION: break; case REGULAR_TOKEN: - rb_ary_push(ary, rb_str_new2(extra.token)); + len = strlen(extra.token); + if (len <= MAX_TOKEN_LEN) + rb_ary_push(ary, rb_str_new(extra.token, len)); free(extra.token); break; case SHEBANG_TOKEN: - s = rb_str_new2("SHEBANG#!"); - rb_str_cat2(s, extra.token); - rb_ary_push(ary, s); + len = strlen(extra.token); + if (len <= MAX_TOKEN_LEN) { + s = rb_str_new2("SHEBANG#!"); + rb_str_cat(s, extra.token, len); + rb_ary_push(ary, s); + } free(extra.token); break; case SGML_TOKEN: - s = rb_str_new2(extra.token); - rb_str_cat2(s, ">"); - rb_ary_push(ary, s); + len = strlen(extra.token); + if (len <= MAX_TOKEN_LEN) { + s = rb_str_new(extra.token, len); + rb_str_cat2(s, ">"); + rb_ary_push(ary, s); + } free(extra.token); break; }