Merge pull request #2097 from github/detect-all-markup

Detect all markup languages when computing language statistics
This commit is contained in:
Adam Roben
2015-02-13 16:43:41 -05:00
9 changed files with 151 additions and 18 deletions

View File

@@ -26,14 +26,14 @@ Linguist supports a number of different custom overrides strategies for language
### Using gitattributes
Add a `.gitattributes` file to your project and use standard git-style path matchers for the files you want to override to set `linguist-language` and `linguist-vendored`.
Add a `.gitattributes` file to your project and use standard git-style path matchers for the files you want to override to set `linguist-documentation`, `linguist-language`, and `linguist-vendored`.
```
$ cat .gitattributes
*.rb linguist-language=Java
```
Checking code you didn't write, such as JavaScript libraries, into your git repo is a common practice, but this often inflates your project's language stats and may even cause your project to be labeled as another language. By default, Linguist treats all of the paths defined in [lib/linguist/vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml) as vendored and therefore doesn't include them in the language statistics for a repository.
Checking code you didn't write, such as JavaScript libraries, into your git repo is a common practice, but this often inflates your project's language stats and may even cause your project to be labeled as another language. By default, Linguist treats all of the paths defined in [lib/linguist/vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml) as vendored and therefore doesn't include them in the language statistics for a repository. Vendored files are also hidden by default in diffs on github.com.
Use the `linguist-vendored` attribute to vendor or un-vendor paths.
@@ -43,6 +43,16 @@ special-vendored-path/* linguist-vendored
jquery.js linguist-vendored=false
```
Similar to vendored files, Linguist excludes documentation files from your project's language stats. (Unlike vendored files, documentation files are displayed in diffs on github.com.) [lib/linguist/documentation.yml](lib/linguist/documentation.yml) lists the common documentation paths that are excluded from the language statistics for your repository by default.
Use the `linguist-documentation` attribute to mark or unmark paths as documentation.
```
$ cat .gitattributes
project-docs/* linguist-documentation
docs/formatter.rb linguist-documentation=false
```
### Using Emacs and Vim modelines
Alternatively, you can use Vim and Emacs style modelines to set the language for a single file. Modelines can be placed anywhere within a file and are respected when determining how to syntax-highlight a file on GitHub.com.

View File

@@ -236,6 +236,21 @@ module Linguist
name =~ VendoredRegexp ? true : false
end
# Regexp source fragments, one per documentation path convention, loaded from
# the documentation.yml file that sits next to this source file.
documentation_yml = File.expand_path("../documentation.yml", __FILE__)
documentation_paths = YAML.load_file(documentation_yml)

# A single alternation of every fragment; a name matching any one of them
# matches this pattern.
DocumentationRegexp = Regexp.new(documentation_paths.join('|'))
# Public: Does the blob's name match a documentation convention?
#
# Blobs for which this is true are left out of language statistics.
#
# The conventions (documentation directories as well as individual files)
# are listed in "documentation.yml".
#
# Returns true or false
def documentation?
  # =~ yields a match offset or nil; collapse that to a strict boolean.
  !!(name =~ DocumentationRegexp)
end
# Public: Get each line of data
#
# Requires Blob#data
@@ -317,5 +332,15 @@ module Linguist
def tm_scope
  # With no detected language there is no scope to report; hand back the
  # falsy language value itself.
  detected = language
  detected ? detected.tm_scope : detected
end
# Language types that count towards repository language statistics.
DETECTABLE_TYPES = [:programming, :markup].freeze

# Internal: Should this blob be included in repository language statistics?
#
# A blob is included only when it is not vendored, not documentation, not
# generated, and its language has one of the DETECTABLE_TYPES.
#
# Returns true or false. A blob with no language yields false rather than
# nil, so the result is always a strict boolean.
def include_in_language_stats?
  # Same check order as before: the exclusion predicates run first and
  # short-circuit before the language is looked at.
  return false if vendored? || documentation? || generated?

  detected = language
  detected ? DETECTABLE_TYPES.include?(detected.type) : false
end
end
end

View File

@@ -0,0 +1,19 @@
# Documentation files and directories are excluded from language
# statistics.
#
# Lines in this file are Regexps that are matched against the file
# pathname.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_documentation` if you make any changes.
## Documentation Conventions ##
- ^docs?/
- ^Documentation/
- (^|/)CONTRIBUTING(\.|$)
- (^|/)COPYING(\.|$)
- (^|/)INSTALL(\.|$)
- (^|/)LICEN[CS]E(\.|$)
- (^|/)README(\.|$)

View File

@@ -32,13 +32,6 @@ module Linguist
# Valid Languages types
TYPES = [:data, :markup, :programming, :prose]
# Names of non-programming languages that we will still detect
#
# Returns an array
def self.detectable_markup
["CSS", "Less", "Sass", "SCSS", "Stylus", "TeX"]
end
# Detect languages by a specific type
#
# type - A symbol that exists within TYPES

View File

@@ -439,6 +439,8 @@ COBOL:
ace_mode: cobol
CSS:
type: markup
tm_scope: source.css
ace_mode: css
color: "#563d7c"
extensions:
@@ -1173,6 +1175,7 @@ HTML:
type: markup
tm_scope: text.html.basic
ace_mode: html
color: "#e44b23"
aliases:
- xhtml
extensions:
@@ -2684,6 +2687,13 @@ STON:
tm_scope: source.smalltalk
ace_mode: text
SVG:
type: data
extensions:
- .svg
tm_scope: text.xml
ace_mode: xml
Sage:
type: programming
group: Python
@@ -3199,7 +3209,6 @@ XML:
- .srdf
- .stTheme
- .sublime-snippet
- .svg
- .targets
- .tmCommand
- .tmLanguage

View File

@@ -4,7 +4,7 @@ require 'rugged'
module Linguist
class LazyBlob
GIT_ATTR = ['linguist-language', 'linguist-vendored']
GIT_ATTR = ['linguist-documentation', 'linguist-language', 'linguist-vendored']
GIT_ATTR_OPTS = { :priority => [:index], :skip_system => true }
GIT_ATTR_FLAGS = Rugged::Repository::Attributes.parse_opts(GIT_ATTR_OPTS)
@@ -37,6 +37,14 @@ module Linguist
end
end
# Public: Is the blob documentation?
#
# A `linguist-documentation` git attribute, when specified for this path,
# takes precedence over the inherited documentation? check.
#
# Returns the attribute's boolean value when it is specified, otherwise
# whatever the inherited documentation? returns.
def documentation?
  attr = git_attributes['linguist-documentation']

  # Only nil means "not specified". Rugged reports an explicitly unset
  # attribute (`-linguist-documentation`) as false; that is an override and
  # must not fall through to the inherited default.
  return super if attr.nil?

  attr != false && boolean_attribute(attr)
end
def language
return @language if defined?(@language)

View File

@@ -156,13 +156,8 @@ module Linguist
blob = Linguist::LazyBlob.new(repository, delta.new_file[:oid], new, mode.to_s(8))
# Skip vendored or generated blobs
next if blob.vendored? || blob.generated? || blob.language.nil?
# Only include programming languages and acceptable markup languages
if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name)
file_map[new] = [blob.language.group.name, blob.size]
end
next unless blob.include_in_language_stats?
file_map[new] = [blob.language.group.name, blob.size]
end
end

View File

@@ -441,6 +441,43 @@ class TestBlob < Minitest::Test
assert sample_blob("subproject/activator.bat").vendored?
end
# Checks documentation? against each convention listed in documentation.yml.
def test_documentation
  # Paths that match a documentation convention.
  documentation = %w[
    doc/foo.html
    docs/foo.html
    Documentation/foo.md
    README
    README.md
    README.txt
    foo/README
    CONTRIBUTING
    CONTRIBUTING.md
    CONTRIBUTING.txt
    foo/CONTRIBUTING
    LICENSE
    LICENCE.md
    LICENSE.txt
    foo/LICENSE
    COPYING
    COPYING.md
    COPYING.txt
    foo/COPYING
    INSTALL
    INSTALL.md
    INSTALL.txt
    foo/INSTALL
  ]

  # Look-alikes: documentation directories that are not at the root, and an
  # ordinary Markdown file.
  not_documentation = %w[
    project/doc/foo.html
    project/docs/foo.html
    project/Documentation/foo.md
    foo.md
  ]

  documentation.each do |path|
    assert_predicate fixture_blob(path), :documentation?
  end

  not_documentation.each do |path|
    refute_predicate fixture_blob(path), :documentation?
  end
end
def test_language
Samples.each do |sample|
blob = sample_blob(sample[:path])
@@ -485,4 +522,29 @@ class TestBlob < Minitest::Test
refute blob.new(" ").empty?
refute blob.new("nope").empty?
end
# Checks which blobs include_in_language_stats? accepts and rejects.
def test_include_in_language_stats
  # Each of these blobs satisfies one exclusion predicate and must be left out.
  excluded = {
    :vendored?      => sample_blob("bower_components/custom/custom.js"),
    :documentation? => fixture_blob("README"),
    :generated?     => sample_blob("CSS/bootstrap.min.css")
  }
  excluded.each do |predicate, blob|
    assert_predicate blob, predicate
    refute_predicate blob, :include_in_language_stats?
  end

  # Languages of these types are left out as well.
  undetected = {
    :data  => sample_blob("Ant Build System/filenames/ant.xml"),
    :prose => sample_blob("Markdown/tender.md")
  }
  undetected.each do |type, blob|
    assert_equal type, blob.language.type
    refute_predicate blob, :include_in_language_stats?
  end

  # An HTML sample is counted.
  assert_predicate sample_blob("HTML/pages.html"), :include_in_language_stats?
end
end

View File

@@ -99,4 +99,16 @@ class TestRepository < Minitest::Test
# overridden .gitattributes
assert !override_unvendored.vendored?
end
# Checks that .gitattributes can override documentation? in both directions.
def test_linguist_override_documentation?
  attr_commit = "d4c8fb8a28e91f97a7e53428a365c0abbac36d3d"
  linguist_repo(attr_commit).read_index

  # overridden by .gitattributes: README.md is no longer documentation
  refute_predicate Linguist::LazyBlob.new(rugged_repository, attr_commit, "README.md"), :documentation?

  # overridden by .gitattributes: the Arduino sample is documentation
  assert_predicate Linguist::LazyBlob.new(rugged_repository, attr_commit, "samples/Arduino/hello.ino"), :documentation?
end
end