From b275b5d728fd0330f8df29513a8c935528e2bc36 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Fri, 4 Sep 2015 10:24:06 +0200 Subject: [PATCH 1/3] Soften memory pressure --- lib/linguist/generated.rb | 8 ++++++-- lib/linguist/heuristics.rb | 3 ++- lib/linguist/lazy_blob.rb | 4 ++++ lib/linguist/repository.rb | 7 +++++-- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/lib/linguist/generated.rb b/lib/linguist/generated.rb index f1fb2d19..d107e737 100644 --- a/lib/linguist/generated.rb +++ b/lib/linguist/generated.rb @@ -241,22 +241,26 @@ module Linguist return lines[0].include?("Code generated by") end + PROTOBUF_EXTENSIONS = ['.py', '.java', '.h', '.cc', '.cpp'] + # Internal: Is the blob a C++, Java or Python source file generated by the # Protocol Buffer compiler? # # Returns true of false. def generated_protocol_buffer? - return false unless ['.py', '.java', '.h', '.cc', '.cpp'].include?(extname) + return false unless PROTOBUF_EXTENSIONS.include?(extname) return false unless lines.count > 1 return lines[0].include?("Generated by the protocol buffer compiler. DO NOT EDIT!") end + APACHE_THRIFT_EXTENSIONS = ['.rb', '.py', '.go', '.js', '.m', '.java', '.h', '.cc', '.cpp'] + # Internal: Is the blob generated by Apache Thrift compiler? # # Returns true or false def generated_apache_thrift? - return false unless ['.rb', '.py', '.go', '.js', '.m', '.java', '.h', '.cc', '.cpp'].include?(extname) + return false unless APACHE_THRIFT_EXTENSIONS.include?(extname) return false unless lines.count > 1 return lines[0].include?("Autogenerated by Thrift Compiler") || lines[1].include?("Autogenerated by Thrift Compiler") diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 1660d99f..11f58b28 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -56,7 +56,8 @@ module Linguist # Internal: Check if this heuristic matches the candidate languages. def matches?(filename) - @extensions.any? { |ext| filename.downcase.end_with?(ext) } + filename = filename.downcase + @extensions.any? { |ext| filename.end_with?(ext) } end # Internal: Perform the heuristic diff --git a/lib/linguist/lazy_blob.rb b/lib/linguist/lazy_blob.rb index 55c10309..28fb78f3 100644 --- a/lib/linguist/lazy_blob.rb +++ b/lib/linguist/lazy_blob.rb @@ -79,6 +79,10 @@ module Linguist @size end + def cleanup! + @data.clear if @data + end + protected # Returns true if the attribute is present and not the string "false". diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index 181ddf0e..01e595da 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -157,8 +157,11 @@ module Linguist blob = Linguist::LazyBlob.new(repository, delta.new_file[:oid], new, mode.to_s(8)) - next unless blob.include_in_language_stats? - file_map[new] = [blob.language.group.name, blob.size] + if blob.include_in_language_stats? + file_map[new] = [blob.language.group.name, blob.size] + end + + blob.cleanup! end end From 13d1f662d1ec2bf1ce3af6eef2459f520f9701fe Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Fri, 4 Sep 2015 15:11:29 +0200 Subject: [PATCH 2/3] Add the `git-linguist` helper --- bin/git-linguist | 141 ++++++++++++++++++++++++++++++++++++++++ github-linguist.gemspec | 2 +- lib/linguist/version.rb | 2 +- 3 files changed, 143 insertions(+), 2 deletions(-) create mode 100755 bin/git-linguist diff --git a/bin/git-linguist b/bin/git-linguist new file mode 100755 index 00000000..f761cfaf --- /dev/null +++ b/bin/git-linguist @@ -0,0 +1,141 @@ +#!/usr/bin/env ruby + +require 'linguist' +require 'rugged' +require 'optparse' +require 'json' +require 'tmpdir' +require 'zlib' + +class GitLinguist + attr_reader :repo_path + attr_reader :commit_oid + attr_reader :incremental + + def initialize(path, commit_oid, incremental = true) + @repo_path = path + @commit_oid = commit_oid || rugged.head.target_id + @incremental = incremental + end + + def linguist + repo = Linguist::Repository.new(rugged, commit_oid) + + if incremental && stats = load_language_stats + old_commit_oid, old_stats = stats + + # A cache with NULL oid means that we want to froze + # these language stats in place and stop computing + # them (for performance reasons) + return old_stats if old_commit_oid == NULL_OID + repo.load_existing_stats(old_commit_oid, old_stats) + end + + result = yield repo + + save_language_stats(commit_oid, repo.cache) + result + end + + def load_language_stats + version, commit_oid, stats = load_cache + if version == LANGUAGE_STATS_CACHE_VERSION && commit_oid && stats + [commit_oid, stats] + end + end + + def save_language_stats(commit_oid, stats) + cache = [LANGUAGE_STATS_CACHE_VERSION, commit_oid, stats] + write_cache(cache) + end + + def clear_language_stats + File.unlink(cache_file) + end + + def disable_language_stats + save_language_stats(NULL_OID, {}) + end + + protected + NULL_OID = ("0" * 40).freeze + + LANGUAGE_STATS_CACHE = 'language-stats.cache' + LANGUAGE_STATS_CACHE_VERSION = "v3:#{Linguist::VERSION}" + + def rugged + @rugged ||= Rugged::Repository.bare(repo_path) + end + + def cache_file + File.join(repo_path, LANGUAGE_STATS_CACHE) + end + + def write_cache(object) + tmp_path = Dir::Tmpname.make_tmpname(cache_file, nil) + + File.open(tmp_path, "wb") do |f| + marshal = Marshal.dump(object) + f.write(Zlib::Deflate.deflate(marshal)) + end + + File.rename(tmp_path, cache_file) + tmp_path = nil + ensure + (File.unlink(tmp_path) rescue nil) if tmp_path + end + + def load_cache + marshal = File.open(cache_file, "rb") { |f| Zlib::Inflate.inflate(f.read) } + Marshal.load(marshal) + rescue SystemCallError, ::Zlib::DataError, ::Zlib::BufError, TypeError + nil + end +end + + +def git_linguist(args) + incremental = true + commit = nil + git_dir = nil + + parser = OptionParser.new do |opts| + opts.banner = "Usage: git-linguist [OPTIONS] stats|breakdown|dump-cache|clear|disable" + + opts.on("-f", "--force", "Force a full rescan") { incremental = false } + opts.on("--git-dir=DIR", "Path to the git repository") { |v| git_dir = v } + opts.on("--commit=COMMIT", "Commit to index") { |v| commit = v} + end + + parser.parse!(args) + + git_dir ||= begin + pwd = Dir.pwd + dotgit = File.join(pwd, ".git") + File.directory?(dotgit) ? dotgit : pwd + end + + wrapper = GitLinguist.new(git_dir, commit, incremental) + + case args.pop + when "stats" + wrapper.linguist do |linguist| + puts JSON.dump(linguist.languages) + end + when "breakdown" + wrapper.linguist do |linguist| + puts JSON.dump(linguist.breakdown_by_file) + end + when "dump-cache" + puts JSON.dump(wrapper.load_language_stats) + when "clear" + wrapper.clear_language_stats + when "disable" + wrapper.disable_language_stats + else + $stderr.print(parser.help) + exit 1 + end +end + +git_linguist(ARGV) diff --git a/github-linguist.gemspec b/github-linguist.gemspec index 87bbc8bf..8e043857 100644 --- a/github-linguist.gemspec +++ b/github-linguist.gemspec @@ -11,7 +11,7 @@ Gem::Specification.new do |s| s.license = "MIT" s.files = Dir['lib/**/*'] - ['lib/linguist/grammars.rb'] - s.executables << 'linguist' + s.executables = ['linguist', 'git-linguist'] s.add_dependency 'charlock_holmes', '~> 0.7.3' s.add_dependency 'escape_utils', '~> 1.1.0' diff --git a/lib/linguist/version.rb b/lib/linguist/version.rb index dbdb7d7f..0ee8b185 100644 --- a/lib/linguist/version.rb +++ b/lib/linguist/version.rb @@ -1,3 +1,3 @@ module Linguist - VERSION = "4.5.15" + VERSION = "4.6.0.rc3" end From c2c068e9db1fdb900716dcc9373e37ff5d2a4ca2 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Mon, 14 Sep 2015 08:43:10 -0700 Subject: [PATCH 3/3] Bump version to 4.6.0 --- lib/linguist/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/linguist/version.rb b/lib/linguist/version.rb index 0ee8b185..a90a01fa 100644 --- a/lib/linguist/version.rb +++ b/lib/linguist/version.rb @@ -1,3 +1,3 @@ module Linguist - VERSION = "4.6.0.rc3" + VERSION = "4.6.0" end