diff --git a/.travis.yml b/.travis.yml index 83880550..3a5791da 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,6 @@ before_install: + - git fetch origin master:master + - git fetch origin v2.0.0:v2.0.0 - sudo apt-get install libicu-dev -y - gem update --system 2.1.11 rvm: diff --git a/README.md b/README.md index 660ac00c..d497e8c2 100644 --- a/README.md +++ b/README.md @@ -152,4 +152,4 @@ If you are the current maintainer of this gem: 0. Test behavior locally, branch deploy, whatever needs to happen 0. Merge github/linguist PR 0. Tag and push: `git tag vx.xx.xx; git push --tags` - 0. Push to rubygems.org -- `gem push github-linguist-2.10.12.gem` + 0. Push to rubygems.org -- `gem push github-linguist-3.0.0.gem` diff --git a/bin/linguist b/bin/linguist index 2cfa8064..e086dcea 100755 --- a/bin/linguist +++ b/bin/linguist @@ -5,6 +5,7 @@ require 'linguist/file_blob' require 'linguist/repository' +require 'rugged' path = ARGV[0] || Dir.pwd @@ -18,7 +19,8 @@ ARGV.shift breakdown = true if ARGV[0] == "--breakdown" if File.directory?(path) - repo = Linguist::Repository.from_directory(path) + rugged = Rugged::Repository.new(path) + repo = Linguist::Repository.new(rugged, rugged.head.target_id) repo.languages.sort_by { |_, size| size }.reverse.each do |language, size| percentage = ((size / repo.size.to_f) * 100) percentage = sprintf '%.2f' % percentage diff --git a/github-linguist.gemspec b/github-linguist.gemspec index d4c2337a..936550f3 100644 --- a/github-linguist.gemspec +++ b/github-linguist.gemspec @@ -17,6 +17,7 @@ Gem::Specification.new do |s| s.add_dependency 'escape_utils', '~> 1.0.1' s.add_dependency 'mime-types', '~> 1.19' s.add_dependency 'pygments.rb', '~> 0.6.0' + s.add_dependency 'rugged', '~> 0.21.0' s.add_development_dependency 'json' s.add_development_dependency 'mocha' diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index 15ab2d9f..27c4b3dd 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -313,15 +313,7 @@ module Linguist # # Returns a Language or nil if none is detected def language - return @language if defined? @language - - if defined?(@data) && @data.is_a?(String) - data = @data - else - data = lambda { (binary_mime_type? || binary?) ? "" : self.data } - end - - @language = Language.detect(name.to_s, data, mode) + @language ||= Language.detect(self) end # Internal: Get the lexer of the blob. diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index a8e7a33c..ed9b68e9 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -92,18 +92,17 @@ module Linguist # Public: Detects the Language of the blob. # - # name - String filename - # data - String blob data. A block also maybe passed in for lazy - # loading. This behavior is deprecated and you should always - # pass in a String. - # mode - Optional String mode (defaults to nil) + # blob - an object that implements the Linguist `Blob` interface; + # see Linguist::LazyBlob and Linguist::FileBlob for examples # # Returns Language or nil. - def self.detect(name, data, mode = nil) + def self.detect(blob) + name = blob.name.to_s + # A bit of an elegant hack. If the file is executable but extensionless, # append a "magic" extension so it can be classified with other # languages that have shebang scripts. - if File.extname(name).empty? && mode && (mode.to_i(8) & 05) == 05 + if File.extname(name).empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05 name += ".script!" end @@ -114,7 +113,7 @@ module Linguist # extension at all, in the case of extensionless scripts), we need to continue # our detection work if possible_languages.length > 1 - data = data.call() if data.respond_to?(:call) + data = blob.data possible_language_names = possible_languages.map(&:name) # Don't bother with emptiness diff --git a/lib/linguist/languages.yml b/lib/linguist/languages.yml index fe5e30cd..e2586022 100644 --- a/lib/linguist/languages.yml +++ b/lib/linguist/languages.yml @@ -89,7 +89,7 @@ Agda: Alloy: type: programming # 'modeling' would be more appropiate - lexer: Text only + lexer: Alloy color: "#cc5c24" extensions: - .als @@ -222,6 +222,8 @@ BlitzBasic: - .decls BlitzMax: + type: programming + color: "#cd6400" extensions: - .bmx @@ -1544,6 +1546,8 @@ PHP: - .phpt filenames: - Phakefile + interpreters: + - php Pan: type: programming diff --git a/lib/linguist/lazy_blob.rb b/lib/linguist/lazy_blob.rb new file mode 100644 index 00000000..bb262241 --- /dev/null +++ b/lib/linguist/lazy_blob.rb @@ -0,0 +1,37 @@ +require 'linguist/blob_helper' +require 'rugged' + +module Linguist + class LazyBlob + include BlobHelper + + MAX_SIZE = 128 * 1024 + + attr_reader :repository + attr_reader :oid + attr_reader :name + attr_reader :mode + + def initialize(repo, oid, name, mode = nil) + @repository = repo + @oid = oid + @name = name + @mode = mode + end + + def data + load_blob! + @data + end + + def size + load_blob! + @size + end + + protected + def load_blob! + @data, @size = Rugged::Blob.to_buffer(repository, oid, MAX_SIZE) if @data.nil? + end + end +end diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index a62bf281..a89c81e6 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -1,4 +1,5 @@ -require 'linguist/file_blob' +require 'linguist/lazy_blob' +require 'rugged' module Linguist # A Repository is an abstraction of a Grit::Repo or a basic file @@ -7,100 +8,146 @@ module Linguist # Its primary purpose is for gathering language statistics across # the entire project. class Repository - # Public: Initialize a new Repository from a File directory - # - # base_path - A path String - # - # Returns a Repository - def self.from_directory(base_path) - new Dir["#{base_path}/**/*"]. - select { |f| File.file?(f) }. - map { |path| FileBlob.new(path, base_path) } + attr_reader :repository + + # Public: Create a new Repository based on the stats of + # an existing one + def self.incremental(repo, commit_oid, old_commit_oid, old_stats) + repo = self.new(repo, commit_oid) + repo.load_existing_stats(old_commit_oid, old_stats) + repo end - # Public: Initialize a new Repository + # Public: Initialize a new Repository to be analyzed for language + # data # - # enum - Enumerator that responds to `each` and - # yields Blob objects + # repo - a Rugged::Repository object + # commit_oid - the sha1 of the commit that will be analyzed; + # this is usually the master branch # # Returns a Repository - def initialize(enum) - @enum = enum - @computed_stats = false - @language = @size = nil - @sizes = Hash.new { 0 } - @file_breakdown = Hash.new { |h,k| h[k] = Array.new } + def initialize(repo, commit_oid) + @repository = repo + @commit_oid = commit_oid + + raise TypeError, 'commit_oid must be a commit SHA1' unless commit_oid.is_a?(String) + end + + # Public: Load the results of a previous analysis on this repository + # to speed up the new scan. + # + # The new analysis will be performed incrementally as to only take + # into account the file changes since the last time the repository + # was scanned + # + # old_commit_oid - the sha1 of the commit that was previously analyzed + # old_stats - the result of the previous analysis, obtained by calling + # Repository#cache on the old repository + # + # Returns nothing + def load_existing_stats(old_commit_oid, old_stats) + @old_commit_oid = old_commit_oid + @old_stats = old_stats + nil end # Public: Returns a breakdown of language stats. # # Examples # - # # => { Language['Ruby'] => 46319, - # Language['JavaScript'] => 258 } + # # => { 'Ruby' => 46319, + # 'JavaScript' => 258 } # - # Returns a Hash of Language keys and Integer size values. + # Returns a Hash of language names and Integer size values. def languages - compute_stats - @sizes + @sizes ||= begin + sizes = Hash.new { 0 } + cache.each do |_, (language, size)| + sizes[language] += size + end + sizes + end end # Public: Get primary Language of repository. # - # Returns a Language + # Returns a language name def language - compute_stats - @language + @language ||= begin + primary = languages.max_by { |(_, size)| size } + primary && primary[0] + end end # Public: Get the total size of the repository. # # Returns a byte size Integer def size - compute_stats - @size + @size ||= languages.inject(0) { |s,(_,v)| s + v } end # Public: Return the language breakdown of this repository by file + # + # Returns a map of language names => [filenames...] def breakdown_by_file - compute_stats - @file_breakdown + @file_breakdown ||= begin + breakdown = Hash.new { |h,k| h[k] = Array.new } + cache.each do |filename, (language, _)| + breakdown[language] << filename + end + breakdown + end end - # Internal: Compute language breakdown for each blob in the Repository. + # Public: Return the cached results of the analysis # - # Returns nothing - def compute_stats - return if @computed_stats + # This is a per-file breakdown that can be passed to other instances + # of Linguist::Repository to perform incremental scans + # + # Returns a map of filename => [language, size] + def cache + @cache ||= begin + if @old_commit_oid == @commit_oid + @old_stats + else + compute_stats(@old_commit_oid, @commit_oid, @old_stats) + end + end + end - @enum.each do |blob| - # Skip files that are likely binary - next if blob.likely_binary? + protected + def compute_stats(old_commit_oid, commit_oid, cache = nil) + file_map = cache ? cache.dup : {} + old_tree = old_commit_oid && Rugged::Commit.lookup(repository, old_commit_oid).tree + new_tree = Rugged::Commit.lookup(repository, commit_oid).tree - # Skip vendored or generated blobs - next if blob.vendored? || blob.generated? || blob.language.nil? + diff = Rugged::Tree.diff(repository, old_tree, new_tree) - # Only include programming languages and acceptable markup languages - if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name) + diff.each_delta do |delta| + old = delta.old_file[:path] + new = delta.new_file[:path] - # Build up the per-file breakdown stats - @file_breakdown[blob.language.group.name] << blob.name + file_map.delete(old) + next if delta.binary - @sizes[blob.language.group] += blob.size + if [:added, :modified].include? delta.status + # Skip submodules + mode = delta.new_file[:mode] + next if (mode & 040000) != 0 + + blob = Linguist::LazyBlob.new(repository, delta.new_file[:oid], new, mode.to_s(8)) + + # Skip vendored or generated blobs + next if blob.vendored? || blob.generated? || blob.language.nil? + + # Only include programming languages and acceptable markup languages + if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name) + file_map[new] = [blob.language.group.name, blob.size] + end end end - # Compute total size - @size = @sizes.inject(0) { |s,(_,v)| s + v } - - # Get primary language - if primary = @sizes.max_by { |(_, size)| size } - @language = primary[0] - end - - @computed_stats = true - - nil + file_map end end end diff --git a/lib/linguist/samples.json b/lib/linguist/samples.json index 0dd99948..f92255de 100644 --- a/lib/linguist/samples.json +++ b/lib/linguist/samples.json @@ -45,6 +45,9 @@ "BlitzBasic": [ ".bb" ], + "BlitzMax": [ + ".bmx" + ], "Bluespec": [ ".bsv" ], @@ -786,8 +789,8 @@ "exception.zep.php" ] }, - "tokens_total": 644911, - "languages_total": 847, + "tokens_total": 644951, + "languages_total": 848, "tokens": { "ABAP": { "*/**": 1, @@ -4945,6 +4948,32 @@ "return_": 2, "First": 1 }, + "BlitzMax": { + "SuperStrict": 1, + "Framework": 1, + "Brl.StandardIO": 1, + "Type": 2, + "TMyType": 3, + "Field": 1, + "property": 1, + "int": 3, + "Function": 1, + "A": 1, + "(": 5, + "param": 1, + ")": 5, + "do": 1, + "nothing": 1, + "End": 2, + "Method": 1, + "Global": 1, + "my": 1, + "new": 1, + "Win32": 1, + "my.A": 2, + "my.B": 2, + "Linux": 1 + }, "Bluespec": { "package": 2, "TbTL": 1, @@ -69860,6 +69889,7 @@ "AutoHotkey": 3, "Awk": 544, "BlitzBasic": 2065, + "BlitzMax": 40, "Bluespec": 1298, "Brightscript": 579, "C": 59053, @@ -70058,6 +70088,7 @@ "AutoHotkey": 1, "Awk": 1, "BlitzBasic": 3, + "BlitzMax": 1, "Bluespec": 2, "Brightscript": 1, "C": 29, @@ -70241,5 +70272,5 @@ "fish": 3, "wisp": 1 }, - "md5": "627951bf1580561b8c69f27efcbe50ed" + "md5": "e76f3defc6dad5102799fa1038a6f957" } \ No newline at end of file diff --git a/lib/linguist/version.rb b/lib/linguist/version.rb index 6704b3fb..c77f7734 100644 --- a/lib/linguist/version.rb +++ b/lib/linguist/version.rb @@ -1,3 +1,3 @@ module Linguist - VERSION = "2.12.0" + VERSION = "3.0.0" end diff --git a/samples/BlitzMax/sample.bmx b/samples/BlitzMax/sample.bmx new file mode 100644 index 00000000..e57e5f58 --- /dev/null +++ b/samples/BlitzMax/sample.bmx @@ -0,0 +1,25 @@ +SuperStrict + +Framework Brl.StandardIO + +Type TMyType + Field property:int + + Function A:int(param:int) + 'do nothing + End Function + + Method B:int(param:int) + 'do nothing + End Method +End Type + + +Global my:TMyType = new TMyType +?Win32 + my.A() + my.B() +?Linux + my.B() + my.A() +? \ No newline at end of file diff --git a/test/test_heuristics.rb b/test/test_heuristics.rb index 0c1a07ff..8bbf0695 100644 --- a/test/test_heuristics.rb +++ b/test/test_heuristics.rb @@ -1,6 +1,7 @@ require 'linguist/heuristics' require 'linguist/language' require 'linguist/samples' +require 'linguist/file_blob' require 'test/unit' @@ -35,7 +36,8 @@ class TestHeuristcs < Test::Unit::TestCase end def test_detect_still_works_if_nothing_matches - match = Language.detect("Hello.m", fixture("Objective-C/hello.m")) + blob = Linguist::FileBlob.new(File.join(samples_path, "Objective-C/hello.m")) + match = Language.detect(blob) assert_equal Language["Objective-C"], match end diff --git a/test/test_repository.rb b/test/test_repository.rb index 832489d3..f4c5ad70 100644 --- a/test/test_repository.rb +++ b/test/test_repository.rb @@ -3,22 +3,24 @@ require 'linguist/repository' require 'test/unit' class TestRepository < Test::Unit::TestCase - include Linguist - - def repo(base_path) - Repository.from_directory(base_path) + def rugged_repository + @rugged ||= Rugged::Repository.new(File.expand_path("../../.git", __FILE__)) end - def linguist_repo - repo(File.expand_path("../..", __FILE__)) + def master_oid + 'd40b4a33deba710e2f494db357c654fbe5d4b419' + end + + def linguist_repo(oid = master_oid) + Linguist::Repository.new(rugged_repository, oid) end def test_linguist_language - # assert_equal Language['Ruby'], linguist_repo.language + assert_equal 'Ruby', linguist_repo.language end def test_linguist_languages - # assert linguist_repo.languages[Language['Ruby']] > 10_000 + assert linguist_repo.languages['Ruby'] > 10_000 end def test_linguist_size @@ -31,7 +33,18 @@ class TestRepository < Test::Unit::TestCase assert linguist_repo.breakdown_by_file["Ruby"].include?("lib/linguist/language.rb") end - def test_binary_override - assert_equal repo(File.expand_path("../../samples/Nimrod", __FILE__)).language, Language["Nimrod"] + def test_incremental_stats + old_commit = '3d7364877d6794f6cc2a86b493e893968a597332' + old_repo = linguist_repo(old_commit) + + assert old_repo.languages['Ruby'] > 10_000 + assert old_repo.size > 30_000 + + new_repo = Linguist::Repository.incremental(rugged_repository, master_oid, old_commit, old_repo.cache) + + assert new_repo.languages['Ruby'] > old_repo.languages['Ruby'] + assert new_repo.size > old_repo.size + + assert_equal linguist_repo.cache, new_repo.cache end end