diff --git a/README.md b/README.md index 4a7ea415..6a2447a9 100644 --- a/README.md +++ b/README.md @@ -26,14 +26,14 @@ Linguist supports a number of different custom overrides strategies for language ### Using gitattributes -Add a `.gitattributes` file to your project and use standard git-style path matchers for the files you want to override to set `linguist-language` and `linguist-vendored`. +Add a `.gitattributes` file to your project and use standard git-style path matchers for the files you want to override to set `linguist-documentation`, `linguist-language`, and `linguist-vendored`. ``` $ cat .gitattributes *.rb linguist-language=Java ``` -Checking code you didn't write, such as JavaScript libraries, into your git repo is a common practice, but this often inflates your project's language stats and may even cause your project to be labeled as another language. By default, Linguist treats all of the paths defined in [lib/linguist/vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml) as vendored and therefore doesn't include them in the language statistics for a repository. +Checking code you didn't write, such as JavaScript libraries, into your git repo is a common practice, but this often inflates your project's language stats and may even cause your project to be labeled as another language. By default, Linguist treats all of the paths defined in [lib/linguist/vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml) as vendored and therefore doesn't include them in the language statistics for a repository. Vendored files are also hidden by default in diffs on github.com. Use the `linguist-vendored` attribute to vendor or un-vendor paths. @@ -43,6 +43,16 @@ special-vendored-path/* linguist-vendored jquery.js linguist-vendored=false ``` +Similar to vendored files, Linguist excludes documentation files from your project's language stats. (Unlike vendored files, documentation files are displayed in diffs on github.com.) [lib/linguist/documentation.yml](lib/linguist/documentation.yml) lists common documentation paths and excludes them from the language statistics for your repository. + +Use the `linguist-documentation` attribute to mark or unmark paths as documentation. + +``` +$ cat .gitattributes +project-docs/* linguist-documentation +docs/formatter.rb linguist-documentation=false +``` + ### Using Emacs and Vim modelines Alternatively, you can use Vim and Emacs style modelines to set the language for a single file. Modelines can be placed anywhere within a file and are respected when determining how to syntax-highlight a file on GitHub.com diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index c368b4d0..56c15f02 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -236,6 +236,21 @@ module Linguist name =~ VendoredRegexp ? true : false end + documentation_paths = YAML.load_file(File.expand_path("../documentation.yml", __FILE__)) + DocumentationRegexp = Regexp.new(documentation_paths.join('|')) + + # Public: Is the blob in a documentation directory? + # + # Documentation files are ignored by language statistics. + # + # See "documentation.yml" for a list of documentation conventions that match + # this pattern. + # + # Return true or false + def documentation? + name =~ DocumentationRegexp ? true : false + end + # Public: Get each line of data # # Requires Blob#data @@ -317,5 +332,15 @@ module Linguist def tm_scope language && language.tm_scope end + + DETECTABLE_TYPES = [:programming, :markup].freeze + + # Internal: Should this blob be included in repository language statistics? + def include_in_language_stats? + !vendored? && + !documentation? && + !generated? && + language && DETECTABLE_TYPES.include?(language.type) + end end end diff --git a/lib/linguist/documentation.yml b/lib/linguist/documentation.yml new file mode 100644 index 00000000..b884cd35 --- /dev/null +++ b/lib/linguist/documentation.yml @@ -0,0 +1,19 @@ +# Documentation files and directories are excluded from language +# statistics. +# +# Lines in this file are Regexps that are matched against the file +# pathname. +# +# Please add additional test coverage to +# `test/test_blob.rb#test_documentation` if you make any changes. + +## Documentation Conventions ## + +- ^docs?/ +- ^Documentation/ + +- (^|/)CONTRIBUTING(\.|$) +- (^|/)COPYING(\.|$) +- (^|/)INSTALL(\.|$) +- (^|/)LICEN[CS]E(\.|$) +- (^|/)README(\.|$) diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 2b6e597f..2490a9f6 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -32,13 +32,6 @@ module Linguist # Valid Languages types TYPES = [:data, :markup, :programming, :prose] - # Names of non-programming languages that we will still detect - # - # Returns an array - def self.detectable_markup - ["CSS", "Less", "Sass", "SCSS", "Stylus", "TeX"] - end - # Detect languages by a specific type # # type - A symbol that exists within TYPES diff --git a/lib/linguist/languages.yml b/lib/linguist/languages.yml index 33a22975..9d8ee228 100644 --- a/lib/linguist/languages.yml +++ b/lib/linguist/languages.yml @@ -439,6 +439,8 @@ COBOL: ace_mode: cobol CSS: + type: markup + tm_scope: source.css ace_mode: css color: "#563d7c" extensions: @@ -1173,6 +1175,7 @@ HTML: type: markup tm_scope: text.html.basic ace_mode: html + color: "#e44b23" aliases: - xhtml extensions: @@ -2684,6 +2687,13 @@ STON: tm_scope: source.smalltalk ace_mode: text +SVG: + type: data + extensions: + - .svg + tm_scope: text.xml + ace_mode: xml + Sage: type: programming group: Python @@ -3199,7 +3209,6 @@ XML: - .srdf - .stTheme - .sublime-snippet - - .svg - .targets - .tmCommand - .tmLanguage diff --git a/lib/linguist/lazy_blob.rb b/lib/linguist/lazy_blob.rb index 9691bca5..5465a71f 100644 --- a/lib/linguist/lazy_blob.rb +++ b/lib/linguist/lazy_blob.rb @@ -4,7 +4,7 @@ require 'rugged' module Linguist class LazyBlob - GIT_ATTR = ['linguist-language', 'linguist-vendored'] + GIT_ATTR = ['linguist-documentation', 'linguist-language', 'linguist-vendored'] GIT_ATTR_OPTS = { :priority => [:index], :skip_system => true } GIT_ATTR_FLAGS = Rugged::Repository::Attributes.parse_opts(GIT_ATTR_OPTS) @@ -37,6 +37,14 @@ module Linguist end end + def documentation? + if attr = git_attributes['linguist-documentation'] + boolean_attribute(attr) + else + super + end + end + def language return @language if defined?(@language) diff --git a/lib/linguist/repository.rb b/lib/linguist/repository.rb index 41e829c5..895a3754 100644 --- a/lib/linguist/repository.rb +++ b/lib/linguist/repository.rb @@ -156,13 +156,8 @@ module Linguist blob = Linguist::LazyBlob.new(repository, delta.new_file[:oid], new, mode.to_s(8)) - # Skip vendored or generated blobs - next if blob.vendored? || blob.generated? || blob.language.nil? - - # Only include programming languages and acceptable markup languages - if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name) - file_map[new] = [blob.language.group.name, blob.size] - end + next unless blob.include_in_language_stats? + file_map[new] = [blob.language.group.name, blob.size] end end diff --git a/test/test_blob.rb b/test/test_blob.rb index ceb54bb3..d59e6794 100644 --- a/test/test_blob.rb +++ b/test/test_blob.rb @@ -441,6 +441,43 @@ class TestBlob < Minitest::Test assert sample_blob("subproject/activator.bat").vendored? end + def test_documentation + assert_predicate fixture_blob("doc/foo.html"), :documentation? + assert_predicate fixture_blob("docs/foo.html"), :documentation? + refute_predicate fixture_blob("project/doc/foo.html"), :documentation? + refute_predicate fixture_blob("project/docs/foo.html"), :documentation? + + assert_predicate fixture_blob("Documentation/foo.md"), :documentation? + refute_predicate fixture_blob("project/Documentation/foo.md"), :documentation? + + assert_predicate fixture_blob("README"), :documentation? + assert_predicate fixture_blob("README.md"), :documentation? + assert_predicate fixture_blob("README.txt"), :documentation? + assert_predicate fixture_blob("foo/README"), :documentation? + + assert_predicate fixture_blob("CONTRIBUTING"), :documentation? + assert_predicate fixture_blob("CONTRIBUTING.md"), :documentation? + assert_predicate fixture_blob("CONTRIBUTING.txt"), :documentation? + assert_predicate fixture_blob("foo/CONTRIBUTING"), :documentation? + + assert_predicate fixture_blob("LICENSE"), :documentation? + assert_predicate fixture_blob("LICENCE.md"), :documentation? + assert_predicate fixture_blob("LICENSE.txt"), :documentation? + assert_predicate fixture_blob("foo/LICENSE"), :documentation? + + assert_predicate fixture_blob("COPYING"), :documentation? + assert_predicate fixture_blob("COPYING.md"), :documentation? + assert_predicate fixture_blob("COPYING.txt"), :documentation? + assert_predicate fixture_blob("foo/COPYING"), :documentation? + + assert_predicate fixture_blob("INSTALL"), :documentation? + assert_predicate fixture_blob("INSTALL.md"), :documentation? + assert_predicate fixture_blob("INSTALL.txt"), :documentation? + assert_predicate fixture_blob("foo/INSTALL"), :documentation? + + refute_predicate fixture_blob("foo.md"), :documentation? + end + def test_language Samples.each do |sample| blob = sample_blob(sample[:path]) @@ -485,4 +522,29 @@ class TestBlob < Minitest::Test refute blob.new(" ").empty? refute blob.new("nope").empty? end + + def test_include_in_language_stats + vendored = sample_blob("bower_components/custom/custom.js") + assert_predicate vendored, :vendored? + refute_predicate vendored, :include_in_language_stats? + + documentation = fixture_blob("README") + assert_predicate documentation, :documentation? + refute_predicate documentation, :include_in_language_stats? + + generated = sample_blob("CSS/bootstrap.min.css") + assert_predicate generated, :generated? + refute_predicate generated, :include_in_language_stats? + + data = sample_blob("Ant Build System/filenames/ant.xml") + assert_equal :data, data.language.type + refute_predicate data, :include_in_language_stats? + + prose = sample_blob("Markdown/tender.md") + assert_equal :prose, prose.language.type + refute_predicate prose, :include_in_language_stats? + + included = sample_blob("HTML/pages.html") + assert_predicate included, :include_in_language_stats? + end end diff --git a/test/test_repository.rb b/test/test_repository.rb index b661668d..fcdd4f0c 100644 --- a/test/test_repository.rb +++ b/test/test_repository.rb @@ -99,4 +99,16 @@ class TestRepository < Minitest::Test # overridden .gitattributes assert !override_unvendored.vendored? end + + def test_linguist_override_documentation? + attr_commit = "d4c8fb8a28e91f97a7e53428a365c0abbac36d3d" + repo = linguist_repo(attr_commit).read_index + + readme = Linguist::LazyBlob.new(rugged_repository, attr_commit, "README.md") + arduino = Linguist::LazyBlob.new(rugged_repository, attr_commit, "samples/Arduino/hello.ino") + + # overridden by .gitattributes + refute_predicate readme, :documentation? + assert_predicate arduino, :documentation? + end end