diff --git a/Gemfile b/Gemfile index 851fabc2..3ac3ffa4 100644 --- a/Gemfile +++ b/Gemfile @@ -1,2 +1,4 @@ source 'https://rubygems.org' + gemspec +gem 'rugged', :git => 'https://github.com/libgit2/rugged.git', branch: 'development', submodules: true diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index 15ab2d9f..27c4b3dd 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -313,15 +313,7 @@ module Linguist # # Returns a Language or nil if none is detected def language - return @language if defined? @language - - if defined?(@data) && @data.is_a?(String) - data = @data - else - data = lambda { (binary_mime_type? || binary?) ? "" : self.data } - end - - @language = Language.detect(name.to_s, data, mode) + @language ||= Language.detect(self) end # Internal: Get the lexer of the blob. diff --git a/lib/linguist/file_blob.rb b/lib/linguist/file_blob.rb index 7e7f1acd..e84e9b61 100644 --- a/lib/linguist/file_blob.rb +++ b/lib/linguist/file_blob.rb @@ -34,9 +34,9 @@ module Linguist # Public: Read file permissions # - # Returns a String like '100644' + # Returns an Int like 0100644 def mode - File.stat(@path).mode.to_s(8) + File.stat(@path).mode end # Public: Read file contents. diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index a8e7a33c..3a354463 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -92,18 +92,14 @@ module Linguist # Public: Detects the Language of the blob. # - # name - String filename - # data - String blob data. A block also maybe passed in for lazy - # loading. This behavior is deprecated and you should always - # pass in a String. - # mode - Optional String mode (defaults to nil) - # # Returns Language or nil. - def self.detect(name, data, mode = nil) + def self.detect(blob) + name = blob.name + # A bit of an elegant hack. If the file is executable but extensionless, # append a "magic" extension so it can be classified with other # languages that have shebang scripts. - if File.extname(name).empty? && mode && (mode.to_i(8) & 05) == 05 + if File.extname(name).empty? && blob.mode && (blob.mode & 05) == 05 name += ".script!" end @@ -114,7 +110,7 @@ module Linguist # extension at all, in the case of extensionless scripts), we need to continue # our detection work if possible_languages.length > 1 - data = data.call() if data.respond_to?(:call) + data = blob.data possible_language_names = possible_languages.map(&:name) # Don't bother with emptiness diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index a62bf281..bbf32171 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -1,4 +1,6 @@ require 'linguist/file_blob' +require 'linguist/lazy_blob' +require 'rugged' module Linguist # A Repository is an abstraction of a Grit::Repo or a basic file @@ -7,29 +9,15 @@ module Linguist # Its primary purpose is for gathering language statistics across # the entire project. class Repository - # Public: Initialize a new Repository from a File directory - # - # base_path - A path String - # - # Returns a Repository - def self.from_directory(base_path) - new Dir["#{base_path}/**/*"]. - select { |f| File.file?(f) }. - map { |path| FileBlob.new(path, base_path) } - end + attr_reader :repository # Public: Initialize a new Repository # - # enum - Enumerator that responds to `each` and - # yields Blob objects - # # Returns a Repository - def initialize(enum) - @enum = enum - @computed_stats = false - @language = @size = nil - @sizes = Hash.new { 0 } - @file_breakdown = Hash.new { |h,k| h[k] = Array.new } + def initialize(repo, sha1, existing_stats = nil) + @repository = repo + @current_sha1 = sha1 + @old_sha1, @old_stats = existing_stats if existing_stats end # Public: Returns a breakdown of language stats. @@ -41,66 +29,90 @@ module Linguist # # Returns a Hash of Language keys and Integer size values. def languages - compute_stats - @sizes + @sizes ||= begin + sizes = Hash.new { 0 } + file_map.each do |_, (language, size)| + sizes[language] += size + end + sizes + end end # Public: Get primary Language of repository. # # Returns a Language def language - compute_stats - @language + @language ||= begin + primary = languages.max_by { |(_, size)| size } + primary && primary[0] + end end # Public: Get the total size of the repository. # # Returns a byte size Integer def size - compute_stats - @size + @size ||= languages.inject(0) { |s,(_,v)| s + v } end # Public: Return the language breakdown of this repository by file def breakdown_by_file - compute_stats - @file_breakdown + @file_breakdown ||= begin + breakdown = Hash.new { |h,k| h[k] = Array.new } + file_map.each do |filename, (language, _)| + breakdown[language.name] << filename + end + breakdown + end + end + + def incremental_stats(old_sha1, new_sha1, file_map = nil) + file_map = file_map ? file_map.dup : {} + old_commit = old_sha1 && Rugged::Commit.lookup(repository, old_sha1) + new_commit = Rugged::Commit.lookup(repository, new_sha1) + + diff = Rugged::Tree.diff(repository, old_commit, new_commit) + + diff.each_delta do |delta| + old = delta.old_file[:path] + new = delta.new_file[:path] + + file_map.delete(old) + next if delta.binary + + if [:added, :modified].include? delta.status + blob = Linguist::LazyBlob.new(repository, delta.new_file[:oid], new, delta.new_file[:mode]) + + # Skip vendored or generated blobs + next if blob.vendored? || blob.generated? || blob.language.nil? + + # Only include programming languages and acceptable markup languages + if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name) + file_map[new] = [blob.language.group, blob.size] + end + end + end + + file_map + end + + def load_stats(file) + @old_sha1, @old_stats = JSON.load(file) + end + + def dump_stats(file) + JSON.dump([@current_sha1, file_map], file) end # Internal: Compute language breakdown for each blob in the Repository. # # Returns nothing - def compute_stats - return if @computed_stats - - @enum.each do |blob| - # Skip files that are likely binary - next if blob.likely_binary? - - # Skip vendored or generated blobs - next if blob.vendored? || blob.generated? || blob.language.nil? - - # Only include programming languages and acceptable markup languages - if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name) - - # Build up the per-file breakdown stats - @file_breakdown[blob.language.group.name] << blob.name - - @sizes[blob.language.group] += blob.size - end - end - - # Compute total size - @size = @sizes.inject(0) { |s,(_,v)| s + v } - - # Get primary language - if primary = @sizes.max_by { |(_, size)| size } - @language = primary[0] - end - - @computed_stats = true - - nil + def file_map + @file_map ||= if @old_sha1 == @current_sha1 + @old_stats + else + incremental_stats(@old_sha1, @current_sha1, @old_stats) + end end end end diff --git a/test/test_heuristics.rb b/test/test_heuristics.rb index 0c1a07ff..33f8a087 100644 --- a/test/test_heuristics.rb +++ b/test/test_heuristics.rb @@ -34,11 +34,6 @@ class TestHeuristcs < Test::Unit::TestCase assert_equal Language["C++"], results.first end - def test_detect_still_works_if_nothing_matches - match = Language.detect("Hello.m", fixture("Objective-C/hello.m")) - assert_equal Language["Objective-C"], match - end - def test_pl_prolog_by_heuristics languages = ["Perl", "Prolog"] results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"), languages) diff --git a/test/test_repository.rb b/test/test_repository.rb index 832489d3..13dbdaef 100644 --- a/test/test_repository.rb +++ b/test/test_repository.rb @@ -5,12 +5,9 @@ require 'test/unit' class TestRepository < Test::Unit::TestCase include Linguist - def repo(base_path) - Repository.from_directory(base_path) - end - def linguist_repo - repo(File.expand_path("../..", __FILE__)) + r = Rugged::Repository.new(File.expand_path("../../.git", __FILE__)) + Linguist::Repository.new(r, '31921838cdc252536ec07668f73d4b64d8022750') end def test_linguist_language @@ -30,8 +27,4 @@ class TestRepository < Test::Unit::TestCase assert linguist_repo.breakdown_by_file["Ruby"].include?("bin/linguist") assert linguist_repo.breakdown_by_file["Ruby"].include?("lib/linguist/language.rb") end - - def test_binary_override - assert_equal repo(File.expand_path("../../samples/Nimrod", __FILE__)).language, Language["Nimrod"] - end end