From 066052ddd20e00ac7133761173898e5155f83255 Mon Sep 17 00:00:00 2001 From: Adam Roben Date: Thu, 12 Feb 2015 10:04:44 -0500 Subject: [PATCH] Exclude documentation files from language statistics Documentation is an important part of a software project but is not generally thought of as part of the code for that project. Repository language statistics are used to quantify the project's code, so it makes sense to exclude documentation from those computations. Documentation files are recognized similarly to vendored files. lib/linguist/documentation.yml contains regular expressions to match common names for documentation files. A new linguist-documentation Git attribute can be used to override those conventions. --- README.md | 12 +++++++++++- lib/linguist/blob_helper.rb | 15 +++++++++++++++ lib/linguist/documentation.yml | 18 ++++++++++++++++++ lib/linguist/lazy_blob.rb | 10 +++++++++- lib/linguist/repository.rb | 2 +- test/test_blob.rb | 32 ++++++++++++++++++++++++++++++++ test/test_repository.rb | 12 ++++++++++++ 7 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 lib/linguist/documentation.yml diff --git a/README.md b/README.md index 4a7ea415..6f8a3993 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Linguist supports a number of different custom overrides strategies for language ### Using gitattributes -Add a `.gitattributes` file to your project and use standard git-style path matchers for the files you want to override to set `linguist-language` and `linguist-vendored`. +Add a `.gitattributes` file to your project and use standard git-style path matchers for the files you want to override to set `linguist-documentation`, `linguist-language`, and `linguist-vendored`. ``` $ cat .gitattributes @@ -43,6 +43,16 @@ special-vendored-path/* linguist-vendored jquery.js linguist-vendored=false ``` +Similar to vendored files, Linguist excludes documentation files from your project's language stats. 
[lib/linguist/documentation.yml](lib/linguist/documentation.yml) lists common documentation paths and excludes them from the language statistics for your repository. + +Use the `linguist-documentation` attribute to mark or unmark paths as documentation. + +``` +$ cat .gitattributes +project-docs/* linguist-documentation +docs/formatter.rb linguist-documentation=false +``` + ### Using Emacs and Vim modelines Alternatively, you can use Vim and Emacs style modelines to set the language for a single file. Modelines can be placed anywhere within a file and are respected when determining how to syntax-highlight a file on GitHub.com diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index c368b4d0..ff11aefd 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -236,6 +236,21 @@ module Linguist name =~ VendoredRegexp ? true : false end + documentation_paths = YAML.load_file(File.expand_path("../documentation.yml", __FILE__)) + DocumentationRegexp = Regexp.new(documentation_paths.join('|')) + + # Public: Is the blob a documentation file or in a documentation directory? + # + # Documentation files are ignored by language statistics. + # + # See "documentation.yml" for a list of documentation conventions that match + # this pattern. + # + # Return true or false + def documentation? + name =~ DocumentationRegexp ? true : false + end + # Public: Get each line of data # # Requires Blob#data diff --git a/lib/linguist/documentation.yml b/lib/linguist/documentation.yml new file mode 100644 index 00000000..57fc6151 --- /dev/null +++ b/lib/linguist/documentation.yml @@ -0,0 +1,18 @@ +# Documentation files and directories are excluded from language +# statistics. +# +# Lines in this file are Regexps that are matched against the file +# pathname. +# +# Please add additional test coverage to +# `test/test_blob.rb#test_documentation` if you make any changes. 
+ +## Documentation Conventions ## + +- ^docs?/ +- ^Documentation/ + +- (^|/)CONTRIBUTING(\.|$) +- (^|/)COPYING(\.|$) +- (^|/)LICEN[CS]E(\.|$) +- (^|/)README(\.|$) diff --git a/lib/linguist/lazy_blob.rb b/lib/linguist/lazy_blob.rb index 9691bca5..5465a71f 100644 --- a/lib/linguist/lazy_blob.rb +++ b/lib/linguist/lazy_blob.rb @@ -4,7 +4,7 @@ require 'rugged' module Linguist class LazyBlob - GIT_ATTR = ['linguist-language', 'linguist-vendored'] + GIT_ATTR = ['linguist-documentation', 'linguist-language', 'linguist-vendored'] GIT_ATTR_OPTS = { :priority => [:index], :skip_system => true } GIT_ATTR_FLAGS = Rugged::Repository::Attributes.parse_opts(GIT_ATTR_OPTS) @@ -37,6 +37,14 @@ module Linguist end end + def documentation? + if attr = git_attributes['linguist-documentation'] + boolean_attribute(attr) + else + super + end + end + def language return @language if defined?(@language) diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index 3c197fad..3837977f 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -159,7 +159,7 @@ module Linguist blob = Linguist::LazyBlob.new(repository, delta.new_file[:oid], new, mode.to_s(8)) # Skip vendored or generated blobs - next if blob.vendored? || blob.generated? || blob.language.nil? + next if blob.vendored? || blob.documentation? || blob.generated? || blob.language.nil? if DETECTABLE_TYPES.include?(blob.language.type) file_map[new] = [blob.language.group.name, blob.size] diff --git a/test/test_blob.rb b/test/test_blob.rb index ceb54bb3..fabd5a74 100644 --- a/test/test_blob.rb +++ b/test/test_blob.rb @@ -441,6 +441,38 @@ class TestBlob < Minitest::Test assert sample_blob("subproject/activator.bat").vendored? end + def test_documentation + assert_predicate fixture_blob("doc/foo.md"), :documentation? + assert_predicate fixture_blob("docs/foo.md"), :documentation? + refute_predicate fixture_blob("project/doc/foo.md"), :documentation? 
+ refute_predicate fixture_blob("project/docs/foo.md"), :documentation? + + assert_predicate fixture_blob("Documentation/foo.md"), :documentation? + refute_predicate fixture_blob("project/Documentation/foo.md"), :documentation? + + assert_predicate fixture_blob("README"), :documentation? + assert_predicate fixture_blob("README.md"), :documentation? + assert_predicate fixture_blob("README.txt"), :documentation? + assert_predicate fixture_blob("foo/README"), :documentation? + + assert_predicate fixture_blob("CONTRIBUTING"), :documentation? + assert_predicate fixture_blob("CONTRIBUTING.md"), :documentation? + assert_predicate fixture_blob("CONTRIBUTING.txt"), :documentation? + assert_predicate fixture_blob("foo/CONTRIBUTING"), :documentation? + + assert_predicate fixture_blob("LICENSE"), :documentation? + assert_predicate fixture_blob("LICENCE.md"), :documentation? + assert_predicate fixture_blob("LICENSE.txt"), :documentation? + assert_predicate fixture_blob("foo/LICENSE"), :documentation? + + assert_predicate fixture_blob("COPYING"), :documentation? + assert_predicate fixture_blob("COPYING.md"), :documentation? + assert_predicate fixture_blob("COPYING.txt"), :documentation? + assert_predicate fixture_blob("foo/COPYING"), :documentation? + + refute_predicate fixture_blob("foo.md"), :documentation? + end + def test_language Samples.each do |sample| blob = sample_blob(sample[:path]) diff --git a/test/test_repository.rb b/test/test_repository.rb index b661668d..fcdd4f0c 100644 --- a/test/test_repository.rb +++ b/test/test_repository.rb @@ -99,4 +99,16 @@ class TestRepository < Minitest::Test # overridden .gitattributes assert !override_unvendored.vendored? end + + def test_linguist_override_documentation? 
+ attr_commit = "d4c8fb8a28e91f97a7e53428a365c0abbac36d3d" + repo = linguist_repo(attr_commit).read_index + + readme = Linguist::LazyBlob.new(rugged_repository, attr_commit, "README.md") + arduino = Linguist::LazyBlob.new(rugged_repository, attr_commit, "samples/Arduino/hello.ino") + + # overridden by .gitattributes + refute_predicate readme, :documentation? + assert_predicate arduino, :documentation? + end end