diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb index ef3d54b7..fcd88efc 100644 --- a/lib/linguist/tokenizer.rb +++ b/lib/linguist/tokenizer.rb @@ -16,6 +16,9 @@ module Linguist new.extract_tokens(data) end + # Read up to 100KB + BYTE_LIMIT = 100_000 + # Start state on token, ignore anything till the next newline SINGLE_LINE_COMMENTS = [ '//', # C @@ -55,6 +58,8 @@ module Linguist tokens = [] until s.eos? + break if s.pos >= BYTE_LIMIT + if token = s.scan(/^#!.+$/) if name = extract_shebang(token) tokens << "SHEBANG#!#{name}"