From c4260ae6814e0b60fa97db1359d1044057df46aa Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Mon, 16 Jun 2014 15:19:56 +0200 Subject: [PATCH 01/20] Use Rugged when computing Repository stats --- Gemfile | 2 + lib/linguist/blob_helper.rb | 10 +-- lib/linguist/file_blob.rb | 4 +- lib/linguist/language.rb | 14 ++-- lib/linguist/repository.rb | 128 ++++++++++++++++++++---------------- test/test_heuristics.rb | 5 -- test/test_repository.rb | 11 +--- 7 files changed, 82 insertions(+), 92 deletions(-) diff --git a/Gemfile b/Gemfile index 851fabc2..3ac3ffa4 100644 --- a/Gemfile +++ b/Gemfile @@ -1,2 +1,4 @@ source 'https://rubygems.org' + gemspec +gem 'rugged', :git => 'https://github.com/libgit2/rugged.git', branch: 'development', submodules: true diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index 15ab2d9f..27c4b3dd 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -313,15 +313,7 @@ module Linguist # # Returns a Language or nil if none is detected def language - return @language if defined? @language - - if defined?(@data) && @data.is_a?(String) - data = @data - else - data = lambda { (binary_mime_type? || binary?) ? "" : self.data } - end - - @language = Language.detect(name.to_s, data, mode) + @language ||= Language.detect(self) end # Internal: Get the lexer of the blob. diff --git a/lib/linguist/file_blob.rb b/lib/linguist/file_blob.rb index 7e7f1acd..e84e9b61 100644 --- a/lib/linguist/file_blob.rb +++ b/lib/linguist/file_blob.rb @@ -34,9 +34,9 @@ module Linguist # Public: Read file permissions # - # Returns a String like '100644' + # Returns an Int like 0100644 def mode - File.stat(@path).mode.to_s(8) + File.stat(@path).mode end # Public: Read file contents. diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index a8e7a33c..3a354463 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -92,18 +92,14 @@ module Linguist # Public: Detects the Language of the blob. # - # name - String filename - # data - String blob data. A block also maybe passed in for lazy - # loading. This behavior is deprecated and you should always - # pass in a String. - # mode - Optional String mode (defaults to nil) - # # Returns Language or nil. - def self.detect(name, data, mode = nil) + def self.detect(blob) + name = blob.name + # A bit of an elegant hack. If the file is executable but extensionless, # append a "magic" extension so it can be classified with other # languages that have shebang scripts. - if File.extname(name).empty? && mode && (mode.to_i(8) & 05) == 05 + if File.extname(name).empty? && blob.mode && (blob.mode & 05) == 05 name += ".script!" end @@ -114,7 +110,7 @@ module Linguist # extension at all, in the case of extensionless scripts), we need to continue # our detection work if possible_languages.length > 1 - data = data.call() if data.respond_to?(:call) + data = blob.data possible_language_names = possible_languages.map(&:name) # Don't bother with emptiness diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index a62bf281..bbf32171 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -1,4 +1,6 @@ require 'linguist/file_blob' +require 'linguist/lazy_blob' +require 'rugged' module Linguist # A Repository is an abstraction of a Grit::Repo or a basic file @@ -7,29 +9,15 @@ module Linguist # Its primary purpose is for gathering language statistics across # the entire project. class Repository - # Public: Initialize a new Repository from a File directory - # - # base_path - A path String - # - # Returns a Repository - def self.from_directory(base_path) - new Dir["#{base_path}/**/*"]. - select { |f| File.file?(f) }. - map { |path| FileBlob.new(path, base_path) } - end + attr_reader :repository # Public: Initialize a new Repository # - # enum - Enumerator that responds to `each` and - # yields Blob objects - # # Returns a Repository - def initialize(enum) - @enum = enum - @computed_stats = false - @language = @size = nil - @sizes = Hash.new { 0 } - @file_breakdown = Hash.new { |h,k| h[k] = Array.new } + def initialize(repo, sha1, existing_stats = nil) + @repository = repo + @current_sha1 = sha1 + @old_sha1, @old_stats = existing_stats if existing_stats end # Public: Returns a breakdown of language stats. @@ -41,66 +29,90 @@ module Linguist # # Returns a Hash of Language keys and Integer size values. def languages - compute_stats - @sizes + @sizes ||= begin + sizes = Hash.new { 0 } + file_map.each do |_, (language, size)| + sizes[language] += size + end + sizes + end end # Public: Get primary Language of repository. # # Returns a Language def language - compute_stats - @language + @language ||= begin + primary = languages.max_by { |(_, size)| size } + primary && primary[0] + end end # Public: Get the total size of the repository. # # Returns a byte size Integer def size - compute_stats - @size + @size ||= languages.inject(0) { |s,(_,v)| s + v } end # Public: Return the language breakdown of this repository by file def breakdown_by_file - compute_stats - @file_breakdown + @file_breakdown ||= begin + breakdown = Hash.new { |h,k| h[k] = Array.new } + file_map.each do |filename, (language, _)| + breakdown[language.name] << filename + end + breakdown + end + end + + def incremental_stats(old_sha1, new_sha1, file_map = nil) + file_map = file_map ? file_map.dup : {} + old_commit = old_sha1 && Rugged::Commit.lookup(repository, old_sha1) + new_commit = Rugged::Commit.lookup(repository, new_sha1) + + diff = Rugged::Tree.diff(repository, old_commit, new_commit) + + diff.each_delta do |delta| + old = delta.old_file[:path] + new = delta.new_file[:path] + + file_map.delete(old) + next if delta.binary + + if [:added, :modified].include? delta.status + blob = Linguist::LazyBlob.new(repository, delta.new_file[:oid], new, delta.new_file[:mode]) + + # Skip vendored or generated blobs + next if blob.vendored? || blob.generated? || blob.language.nil? + + # Only include programming languages and acceptable markup languages + if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name) + file_map[new] = [blob.language.group, blob.size] + end + end + end + + file_map + end + + def load_stats(file) + @old_sha1, @old_stats = JSON.load(file) + end + + def dump_stats(file) + JSON.dump([@current_sha1, file_map], file) end # Internal: Compute language breakdown for each blob in the Repository. # # Returns nothing - def compute_stats - return if @computed_stats - - @enum.each do |blob| - # Skip files that are likely binary - next if blob.likely_binary? - - # Skip vendored or generated blobs - next if blob.vendored? || blob.generated? || blob.language.nil? - - # Only include programming languages and acceptable markup languages - if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name) - - # Build up the per-file breakdown stats - @file_breakdown[blob.language.group.name] << blob.name - - @sizes[blob.language.group] += blob.size - end - end - - # Compute total size - @size = @sizes.inject(0) { |s,(_,v)| s + v } - - # Get primary language - if primary = @sizes.max_by { |(_, size)| size } - @language = primary[0] - end - - @computed_stats = true - - nil + def file_map + @file_map ||= if @old_sha1 == @current_sha1 + @old_stats + else + incremental_stats(@old_sha1, @current_sha1, @old_stats) + end end end end diff --git a/test/test_heuristics.rb b/test/test_heuristics.rb index 0c1a07ff..33f8a087 100644 --- a/test/test_heuristics.rb +++ b/test/test_heuristics.rb @@ -34,11 +34,6 @@ class TestHeuristcs < Test::Unit::TestCase assert_equal Language["C++"], results.first end - def test_detect_still_works_if_nothing_matches - match = Language.detect("Hello.m", fixture("Objective-C/hello.m")) - assert_equal Language["Objective-C"], match - end - def test_pl_prolog_by_heuristics languages = ["Perl", "Prolog"] results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"), languages) diff --git a/test/test_repository.rb b/test/test_repository.rb index 832489d3..13dbdaef 100644 --- a/test/test_repository.rb +++ b/test/test_repository.rb @@ -5,12 +5,9 @@ require 'test/unit' class TestRepository < Test::Unit::TestCase include Linguist - def repo(base_path) - Repository.from_directory(base_path) - end - def linguist_repo - repo(File.expand_path("../..", __FILE__)) + r = Rugged::Repository.new(File.expand_path("../../.git", __FILE__)) + Linguist::Repository.new(r, '31921838cdc252536ec07668f73d4b64d8022750') end def test_linguist_language @@ -30,8 +27,4 @@ class TestRepository < Test::Unit::TestCase assert linguist_repo.breakdown_by_file["Ruby"].include?("bin/linguist") assert linguist_repo.breakdown_by_file["Ruby"].include?("lib/linguist/language.rb") end - - def test_binary_override - assert_equal repo(File.expand_path("../../samples/Nimrod", __FILE__)).language, Language["Nimrod"] - end end From cd58a30c7c29fb1a0cd4331eb7d1f36ce445bbc7 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Mon, 16 Jun 2014 16:41:58 +0200 Subject: [PATCH 02/20] Only cache strings, thanks --- lib/linguist/repository.rb | 37 ++++++++++++++----------------------- lib/linguist/version.rb | 2 +- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index bbf32171..960ae3df 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -31,7 +31,7 @@ module Linguist def languages @sizes ||= begin sizes = Hash.new { 0 } - file_map.each do |_, (language, size)| + cache.each do |_, (language, size)| sizes[language] += size end sizes @@ -59,15 +59,15 @@ module Linguist def breakdown_by_file @file_breakdown ||= begin breakdown = Hash.new { |h,k| h[k] = Array.new } - file_map.each do |filename, (language, _)| - breakdown[language.name] << filename + cache.each do |filename, (language, _)| + breakdown[language] << filename end breakdown end end - def incremental_stats(old_sha1, new_sha1, file_map = nil) - file_map = file_map ? file_map.dup : {} + def incremental_stats(old_sha1, new_sha1, cache = nil) + file_map = cache ? cache.dup : {} old_commit = old_sha1 && Rugged::Commit.lookup(repository, old_sha1) new_commit = Rugged::Commit.lookup(repository, new_sha1) @@ -88,7 +88,7 @@ module Linguist # Only include programming languages and acceptable markup languages if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name) - file_map[new] = [blob.language.group, blob.size] + file_map[new] = [blob.language.group.name, blob.size] end end end @@ -96,23 +96,14 @@ module Linguist file_map end - def load_stats(file) - @old_sha1, @old_stats = JSON.load(file) - end - - def dump_stats(file) - JSON.dump([@current_sha1, file_map], file) - end - - # Internal: Compute language breakdown for each blob in the Repository. - # - # Returns nothing - def file_map - @file_map ||= if @old_sha1 == @current_sha1 - @old_stats - else - incremental_stats(@old_sha1, @current_sha1, @old_stats) - end + def cache + @cache ||= begin + if @old_sha1 == @current_sha1 + @old_stats + else + incremental_stats(@old_sha1, @current_sha1, @old_stats) + end + end end end end diff --git a/lib/linguist/version.rb b/lib/linguist/version.rb index 6704b3fb..ed7d57dc 100644 --- a/lib/linguist/version.rb +++ b/lib/linguist/version.rb @@ -1,3 +1,3 @@ module Linguist - VERSION = "2.12.0" + VERSION = "2.13.0.github2" end From 463f48f04f4f2137222044ba42d5dc7175f5dbf2 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Mon, 16 Jun 2014 17:31:44 +0200 Subject: [PATCH 03/20] Mode must always be a String --- lib/linguist/file_blob.rb | 4 ++-- lib/linguist/language.rb | 2 +- lib/linguist/repository.rb | 3 ++- lib/linguist/version.rb | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/linguist/file_blob.rb b/lib/linguist/file_blob.rb index e84e9b61..9123922f 100644 --- a/lib/linguist/file_blob.rb +++ b/lib/linguist/file_blob.rb @@ -34,9 +34,9 @@ module Linguist # Public: Read file permissions # - # Returns an Int like 0100644 + # Returns a String like '0100644' def mode - File.stat(@path).mode + File.stat(@path).mode.to_s(8) end # Public: Read file contents. diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 3a354463..bdaa91e8 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -99,7 +99,7 @@ module Linguist # A bit of an elegant hack. If the file is executable but extensionless, # append a "magic" extension so it can be classified with other # languages that have shebang scripts. - if File.extname(name).empty? && blob.mode && (blob.mode & 05) == 05 + if File.extname(name).empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05 name += ".script!" end diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index 960ae3df..9fb6ebc7 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -81,7 +81,8 @@ module Linguist next if delta.binary if [:added, :modified].include? delta.status - blob = Linguist::LazyBlob.new(repository, delta.new_file[:oid], new, delta.new_file[:mode]) + mode = delta.new_file[:mode].to_s(8) + blob = Linguist::LazyBlob.new(repository, delta.new_file[:oid], new, mode) # Skip vendored or generated blobs next if blob.vendored? || blob.generated? || blob.language.nil? diff --git a/lib/linguist/version.rb b/lib/linguist/version.rb index ed7d57dc..c47a92f3 100644 --- a/lib/linguist/version.rb +++ b/lib/linguist/version.rb @@ -1,3 +1,3 @@ module Linguist - VERSION = "2.13.0.github2" + VERSION = "2.13.0.github3" end From ea1fc90cf5610acf6dd987af324ceab7ed78dbc0 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Mon, 16 Jun 2014 18:41:51 +0200 Subject: [PATCH 04/20] Handle `nil` blob names --- lib/linguist/file_blob.rb | 2 +- lib/linguist/language.rb | 2 +- lib/linguist/version.rb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/linguist/file_blob.rb b/lib/linguist/file_blob.rb index 9123922f..7e7f1acd 100644 --- a/lib/linguist/file_blob.rb +++ b/lib/linguist/file_blob.rb @@ -34,7 +34,7 @@ module Linguist # Public: Read file permissions # - # Returns a String like '0100644' + # Returns a String like '100644' def mode File.stat(@path).mode.to_s(8) end diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index bdaa91e8..ffbecb7b 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -94,7 +94,7 @@ module Linguist # # Returns Language or nil. def self.detect(blob) - name = blob.name + name = blob.name.to_s # A bit of an elegant hack. If the file is executable but extensionless, # append a "magic" extension so it can be classified with other diff --git a/lib/linguist/version.rb b/lib/linguist/version.rb index c47a92f3..232278b4 100644 --- a/lib/linguist/version.rb +++ b/lib/linguist/version.rb @@ -1,3 +1,3 @@ module Linguist - VERSION = "2.13.0.github3" + VERSION = "2.13.0.github4" end From 5896bb8fa35d813902dff6a7495f3da3ec2f32f2 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Tue, 24 Jun 2014 17:52:43 +0200 Subject: [PATCH 05/20] Missing file. Duh. --- lib/linguist/lazy_blob.rb | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 lib/linguist/lazy_blob.rb diff --git a/lib/linguist/lazy_blob.rb b/lib/linguist/lazy_blob.rb new file mode 100644 index 00000000..bb262241 --- /dev/null +++ b/lib/linguist/lazy_blob.rb @@ -0,0 +1,37 @@ +require 'linguist/blob_helper' +require 'rugged' + +module Linguist + class LazyBlob + include BlobHelper + + MAX_SIZE = 128 * 1024 + + attr_reader :repository + attr_reader :oid + attr_reader :name + attr_reader :mode + + def initialize(repo, oid, name, mode = nil) + @repository = repo + @oid = oid + @name = name + @mode = mode + end + + def data + load_blob! + @data + end + + def size + load_blob! + @size + end + + protected + def load_blob! + @data, @size = Rugged::Blob.to_buffer(repository, oid, MAX_SIZE) if @data.nil? + end + end +end From 1fd59361b5c4ce323d633261377a9175d3f8f1f9 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Wed, 25 Jun 2014 20:26:44 +0200 Subject: [PATCH 06/20] Proper incremental diffing --- lib/linguist/repository.rb | 18 +++++++++--------- test/test_repository.rb | 33 +++++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index 9fb6ebc7..e11d2b23 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -14,10 +14,10 @@ module Linguist # Public: Initialize a new Repository # # Returns a Repository - def initialize(repo, sha1, existing_stats = nil) + def initialize(repo, commit_oid, existing_stats = nil) @repository = repo - @current_sha1 = sha1 - @old_sha1, @old_stats = existing_stats if existing_stats + @commit_oid = commit_oid + @old_commit_oid, @old_stats = existing_stats if existing_stats end # Public: Returns a breakdown of language stats. @@ -66,12 +66,12 @@ module Linguist end end - def incremental_stats(old_sha1, new_sha1, cache = nil) + def compute_stats(old_commit_oid, commit_oid, cache = nil) file_map = cache ? cache.dup : {} - old_commit = old_sha1 && Rugged::Commit.lookup(repository, old_sha1) - new_commit = Rugged::Commit.lookup(repository, new_sha1) + old_tree = old_commit_oid && Rugged::Commit.lookup(repository, old_commit_oid).tree + new_tree = Rugged::Commit.lookup(repository, commit_oid).tree - diff = Rugged::Tree.diff(repository, old_commit, new_commit) + diff = Rugged::Tree.diff(repository, old_tree, new_tree) diff.each_delta do |delta| old = delta.old_file[:path] @@ -99,10 +99,10 @@ module Linguist def cache @cache ||= begin - if @old_sha1 == @current_sha1 + if @old_commit_oid == @commit_oid @old_stats else - incremental_stats(@old_sha1, @current_sha1, @old_stats) + compute_stats(@old_commit_oid, @commit_oid, @old_stats) end end end diff --git a/test/test_repository.rb b/test/test_repository.rb index 13dbdaef..bd96e66d 100644 --- a/test/test_repository.rb +++ b/test/test_repository.rb @@ -3,19 +3,24 @@ require 'linguist/repository' require 'test/unit' class TestRepository < Test::Unit::TestCase - include Linguist + def rugged_repository + @rugged ||= Rugged::Repository.new(File.expand_path("../../.git", __FILE__)) + end - def linguist_repo - r = Rugged::Repository.new(File.expand_path("../../.git", __FILE__)) - Linguist::Repository.new(r, '31921838cdc252536ec07668f73d4b64d8022750') + def master_oid + @master_oid ||= Rugged::Object.rev_parse_oid(rugged_repository, 'master') + end + + def linguist_repo(oid = master_oid) + Linguist::Repository.new(rugged_repository, oid) end def test_linguist_language - # assert_equal Language['Ruby'], linguist_repo.language + assert_equal 'Ruby', linguist_repo.language end def test_linguist_languages - # assert linguist_repo.languages[Language['Ruby']] > 10_000 + assert linguist_repo.languages['Ruby'] > 10_000 end def test_linguist_size @@ -27,4 +32,20 @@ class TestRepository < Test::Unit::TestCase assert linguist_repo.breakdown_by_file["Ruby"].include?("bin/linguist") assert linguist_repo.breakdown_by_file["Ruby"].include?("lib/linguist/language.rb") end + + def test_incremental_stats + old_commit = Rugged::Object.rev_parse_oid(rugged_repository, 'v2.0.0') + old_repo = linguist_repo(old_commit) + + assert old_repo.languages['Ruby'] > 10_000 + assert old_repo.size > 30_000 + + old_cache = [old_commit, old_repo.cache] + new_repo = Linguist::Repository.new(rugged_repository, master_oid, old_cache) + + assert new_repo.languages['Ruby'] > old_repo.languages['Ruby'] + assert new_repo.size > old_repo.size + + assert_equal linguist_repo.cache, new_repo.cache + end end From 29072d6eae88b31c56cf7604e5c2aeb39afbcbda Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Thu, 26 Jun 2014 12:27:02 +0200 Subject: [PATCH 07/20] Fix travis build --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 83880550..3a5791da 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,6 @@ before_install: + - git fetch origin master:master + - git fetch origin v2.0.0:v2.0.0 - sudo apt-get install libicu-dev -y - gem update --system 2.1.11 rvm: From 659d27cae589c9cb5a085239dd3833a8cede9462 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Thu, 26 Jun 2014 12:54:08 +0200 Subject: [PATCH 08/20] DOCS --- lib/linguist/repository.rb | 74 ++++++++++++++++++++++++++++---------- test/test_repository.rb | 3 +- 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index e11d2b23..82e9ef47 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -1,4 +1,3 @@ -require 'linguist/file_blob' require 'linguist/lazy_blob' require 'rugged' @@ -11,23 +10,53 @@ module Linguist class Repository attr_reader :repository - # Public: Initialize a new Repository + # Public: Create a new Repository based on the stats of + # an existing one + def self.incremental(repo, commit_oid, old_commit_oid, old_stats) + repo = self.new(repo, commit_oid) + repo.load_existing_stats(old_commit_oid, old_stats) + repo + end + + # Public: Initialize a new Repository to be analyzed for language + # data + # + # repo - a Rugged::Repository object + # commit_oid - the sha1 of the commit that will be analyzed; + # this is usually the master branch # # Returns a Repository - def initialize(repo, commit_oid, existing_stats = nil) + def initialize(repo, commit_oid) @repository = repo @commit_oid = commit_oid - @old_commit_oid, @old_stats = existing_stats if existing_stats + end + + # Public: Load the results of a previous analysis on this repository + # to speed up the new scan. + # + # The new analysis will be performed incrementally as to only take + # into account the file changes since the last time the repository + # was scanned + # + # old_commit_oid - the sha1 of the commit that was previously analyzed + # old_stats - the result of the previous analysis, obtained by calling + # Repository#cache on the old repository + # + # Returns nothing + def load_existing_stats(old_commit_oid, old_stats) + @old_commit_oid = old_commit_oid + @old_stats = old_stats + nil end # Public: Returns a breakdown of language stats. # # Examples # - # # => { Language['Ruby'] => 46319, - # Language['JavaScript'] => 258 } + # # => { 'Ruby' => 46319, + # 'JavaScript' => 258 } # - # Returns a Hash of Language keys and Integer size values. + # Returns a Hash of language names and Integer size values. def languages @sizes ||= begin sizes = Hash.new { 0 } @@ -40,7 +69,7 @@ module Linguist # Public: Get primary Language of repository. # - # Returns a Language + # Returns a language name def language @language ||= begin primary = languages.max_by { |(_, size)| size } @@ -56,6 +85,8 @@ module Linguist end # Public: Return the language breakdown of this repository by file + # + # Returns a map of language names => [filenames...] def breakdown_by_file @file_breakdown ||= begin breakdown = Hash.new { |h,k| h[k] = Array.new } @@ -66,6 +97,23 @@ module Linguist end end + # Public: Return the cached results of the analysis + # + # This is a per-file breakdown that can be passed to other instances + # of Linguist::Repository to perform incremental scans + # + # Returns a map of filename => [language, size] + def cache + @cache ||= begin + if @old_commit_oid == @commit_oid + @old_stats + else + compute_stats(@old_commit_oid, @commit_oid, @old_stats) + end + end + end + + protected def compute_stats(old_commit_oid, commit_oid, cache = nil) file_map = cache ? cache.dup : {} old_tree = old_commit_oid && Rugged::Commit.lookup(repository, old_commit_oid).tree @@ -96,15 +144,5 @@ module Linguist file_map end - - def cache - @cache ||= begin - if @old_commit_oid == @commit_oid - @old_stats - else - compute_stats(@old_commit_oid, @commit_oid, @old_stats) - end - end - end end end diff --git a/test/test_repository.rb b/test/test_repository.rb index bd96e66d..6637f51f 100644 --- a/test/test_repository.rb +++ b/test/test_repository.rb @@ -40,8 +40,7 @@ class TestRepository < Test::Unit::TestCase assert old_repo.languages['Ruby'] > 10_000 assert old_repo.size > 30_000 - old_cache = [old_commit, old_repo.cache] - new_repo = Linguist::Repository.new(rugged_repository, master_oid, old_cache) + new_repo = Linguist::Repository.incremental(rugged_repository, master_oid, old_commit, old_repo.cache) assert new_repo.languages['Ruby'] > old_repo.languages['Ruby'] assert new_repo.size > old_repo.size From bc34345a56f977ac070675b57ad01fd76cf5778b Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Thu, 26 Jun 2014 13:03:30 +0200 Subject: [PATCH 09/20] Fix the linguist binary --- bin/linguist | 4 +++- lib/linguist/repository.rb | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/bin/linguist b/bin/linguist index 2cfa8064..e086dcea 100755 --- a/bin/linguist +++ b/bin/linguist @@ -5,6 +5,7 @@ require 'linguist/file_blob' require 'linguist/repository' +require 'rugged' path = ARGV[0] || Dir.pwd @@ -18,7 +19,8 @@ ARGV.shift breakdown = true if ARGV[0] == "--breakdown" if File.directory?(path) - repo = Linguist::Repository.from_directory(path) + rugged = Rugged::Repository.new(path) + repo = Linguist::Repository.new(rugged, rugged.head.target_id) repo.languages.sort_by { |_, size| size }.reverse.each do |language, size| percentage = ((size / repo.size.to_f) * 100) percentage = sprintf '%.2f' % percentage diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index 82e9ef47..e3d2d707 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -29,6 +29,8 @@ module Linguist def initialize(repo, commit_oid) @repository = repo @commit_oid = commit_oid + + raise TypeError, 'commit_oid must be a commit SHA1' unless commit_oid.is_a?(String) end # Public: Load the results of a previous analysis on this repository From 00a873dcc7ed88201b2a502f703e04784ddeccb8 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Thu, 26 Jun 2014 13:03:41 +0200 Subject: [PATCH 10/20] Bump 3.0.0b0 --- lib/linguist/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/linguist/version.rb b/lib/linguist/version.rb index 232278b4..42e0eb95 100644 --- a/lib/linguist/version.rb +++ b/lib/linguist/version.rb @@ -1,3 +1,3 @@ module Linguist - VERSION = "2.13.0.github4" + VERSION = "3.0.0b0" end From 324ac834891a31b4738cd7a8f17da2d2db93075d Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Thu, 26 Jun 2014 14:12:00 +0200 Subject: [PATCH 11/20] Use the new Rugged release --- Gemfile | 1 - github-linguist.gemspec | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index 3ac3ffa4..fa75df15 100644 --- a/Gemfile +++ b/Gemfile @@ -1,4 +1,3 @@ source 'https://rubygems.org' gemspec -gem 'rugged', :git => 'https://github.com/libgit2/rugged.git', branch: 'development', submodules: true diff --git a/github-linguist.gemspec b/github-linguist.gemspec index d4c2337a..936550f3 100644 --- a/github-linguist.gemspec +++ b/github-linguist.gemspec @@ -17,6 +17,7 @@ Gem::Specification.new do |s| s.add_dependency 'escape_utils', '~> 1.0.1' s.add_dependency 'mime-types', '~> 1.19' s.add_dependency 'pygments.rb', '~> 0.6.0' + s.add_dependency 'rugged', '~> 0.21.0' s.add_development_dependency 'json' s.add_development_dependency 'mocha' From 907d3c5a3676daee59e46cfbf43a716822c47199 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Thu, 26 Jun 2014 18:17:51 +0200 Subject: [PATCH 12/20] b1 --- lib/linguist/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/linguist/version.rb b/lib/linguist/version.rb index 42e0eb95..c5c3085d 100644 --- a/lib/linguist/version.rb +++ b/lib/linguist/version.rb @@ -1,3 +1,3 @@ module Linguist - VERSION = "3.0.0b0" + VERSION = "3.0.0b1" end From 621042e63936fbdecf1e526e26ef3ada91e3b7e0 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Thu, 26 Jun 2014 18:42:43 +0200 Subject: [PATCH 13/20] Remove whitespace --- Gemfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Gemfile b/Gemfile index fa75df15..851fabc2 100644 --- a/Gemfile +++ b/Gemfile @@ -1,3 +1,2 @@ source 'https://rubygems.org' - gemspec From 12429b90fe02171cc035d16164ca723bb83f2213 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Thu, 26 Jun 2014 21:24:30 +0200 Subject: [PATCH 14/20] Bring back missing test --- test/test_heuristics.rb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/test_heuristics.rb b/test/test_heuristics.rb index 33f8a087..8bbf0695 100644 --- a/test/test_heuristics.rb +++ b/test/test_heuristics.rb @@ -1,6 +1,7 @@ require 'linguist/heuristics' require 'linguist/language' require 'linguist/samples' +require 'linguist/file_blob' require 'test/unit' @@ -34,6 +35,12 @@ class TestHeuristcs < Test::Unit::TestCase assert_equal Language["C++"], results.first end + def test_detect_still_works_if_nothing_matches + blob = Linguist::FileBlob.new(File.join(samples_path, "Objective-C/hello.m")) + match = Language.detect(blob) + assert_equal Language["Objective-C"], match + end + def test_pl_prolog_by_heuristics languages = ["Perl", "Prolog"] results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"), languages) From 65eaf98d0badbb5f6012f9d5bf947e418e28f9a0 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Thu, 26 Jun 2014 21:26:26 +0200 Subject: [PATCH 15/20] docs --- lib/linguist/language.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index ffbecb7b..ed9b68e9 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -92,6 +92,9 @@ module Linguist # Public: Detects the Language of the blob. # + # blob - an object that implements the Linguist `Blob` interface; + # see Linguist::LazyBlob and Linguist::FileBlob for examples + # # Returns Language or nil. def self.detect(blob) name = blob.name.to_s From d206131df0df57c9a77b88b4a6709ae65a8075b1 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Fri, 27 Jun 2014 13:51:37 +0200 Subject: [PATCH 16/20] Hardcode OIDs for test --- test/test_repository.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_repository.rb b/test/test_repository.rb index 6637f51f..f4c5ad70 100644 --- a/test/test_repository.rb +++ b/test/test_repository.rb @@ -8,7 +8,7 @@ class TestRepository < Test::Unit::TestCase end def master_oid - @master_oid ||= Rugged::Object.rev_parse_oid(rugged_repository, 'master') + 'd40b4a33deba710e2f494db357c654fbe5d4b419' end def linguist_repo(oid = master_oid) @@ -34,7 +34,7 @@ class TestRepository < Test::Unit::TestCase end def test_incremental_stats - old_commit = Rugged::Object.rev_parse_oid(rugged_repository, 'v2.0.0') + old_commit = '3d7364877d6794f6cc2a86b493e893968a597332' old_repo = linguist_repo(old_commit) assert old_repo.languages['Ruby'] > 10_000 From 32828a9af56d748225609e55e99e5c5df4c041cc Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Fri, 27 Jun 2014 13:51:56 +0200 Subject: [PATCH 17/20] b2 --- lib/linguist/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/linguist/version.rb b/lib/linguist/version.rb index c5c3085d..eb7bbeb7 100644 --- a/lib/linguist/version.rb +++ b/lib/linguist/version.rb @@ -1,3 +1,3 @@ module Linguist - VERSION = "3.0.0b1" + VERSION = "3.0.0b2" end From d9be472ccb22ef2ccb5b131a80757bb9ab6c6b0c Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Fri, 27 Jun 2014 16:41:23 +0200 Subject: [PATCH 18/20] Skip submodules when diffing --- lib/linguist/repository.rb | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index e3d2d707..a89c81e6 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -131,8 +131,11 @@ module Linguist next if delta.binary if [:added, :modified].include? delta.status - mode = delta.new_file[:mode].to_s(8) - blob = Linguist::LazyBlob.new(repository, delta.new_file[:oid], new, mode) + # Skip submodules + mode = delta.new_file[:mode] + next if (mode & 040000) != 0 + + blob = Linguist::LazyBlob.new(repository, delta.new_file[:oid], new, mode.to_s(8)) # Skip vendored or generated blobs next if blob.vendored? || blob.generated? || blob.language.nil? From df09a746a0d76cbe2ebe8858b69cd0e3b61141c4 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Fri, 27 Jun 2014 16:57:58 +0200 Subject: [PATCH 19/20] b3 --- lib/linguist/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/linguist/version.rb b/lib/linguist/version.rb index eb7bbeb7..ab1a401e 100644 --- a/lib/linguist/version.rb +++ b/lib/linguist/version.rb @@ -1,3 +1,3 @@ module Linguist - VERSION = "3.0.0b2" + VERSION = "3.0.0b3" end From 9281bd043aec088e656fece3bc5b689fd1af5dc7 Mon Sep 17 00:00:00 2001 From: Arfon Smith Date: Tue, 1 Jul 2014 11:19:05 -0500 Subject: [PATCH 20/20] Version --- lib/linguist/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/linguist/version.rb b/lib/linguist/version.rb index ab1a401e..c77f7734 100644 --- a/lib/linguist/version.rb +++ b/lib/linguist/version.rb @@ -1,3 +1,3 @@ module Linguist - VERSION = "3.0.0b3" + VERSION = "3.0.0" end