Generate language_id (#3284)

* Generate language_id from language names

The language_id is derived from the SHA-256 hash of the language's name, reduced modulo 2**30 - 1 so that it fits in a 32-bit integer.

* Test the validity of language ids

All languages should have a non-negative 32-bit integer (0 <= id < 2**31 - 1) as an id.

* Update languages.yml header in set-language-ids
This commit is contained in:
Paul Chaignon
2016-11-29 16:50:44 +01:00
committed by Brandon Black
parent d46a529b6a
commit 0980e304b1
2 changed files with 19 additions and 15 deletions

View File

@@ -11,6 +11,8 @@ header = <<-EOF
# ace_mode - A String name of the Ace Mode used for highlighting whenever # ace_mode - A String name of the Ace Mode used for highlighting whenever
# a file is edited. This must match one of the filenames in http://git.io/3XO_Cg. # a file is edited. This must match one of the filenames in http://git.io/3XO_Cg.
# Use "text" if a mode does not exist. # Use "text" if a mode does not exist.
# codemirror_mode - A String name of the CodeMirror Mode used for highlighting whenever a file is edited.
# This must match a mode from https://git.io/vi9Fx
# wrap - Boolean wrap to enable line wrapping (default: false) # wrap - Boolean wrap to enable line wrapping (default: false)
# extensions - An Array of associated extensions (the first one is # extensions - An Array of associated extensions (the first one is
# considered the primary extension, the others should be # considered the primary extension, the others should be
@@ -20,9 +22,9 @@ header = <<-EOF
# search_term - Deprecated: Some languages may be indexed under a # search_term - Deprecated: Some languages may be indexed under a
# different alias. Avoid defining new exceptions. # different alias. Avoid defining new exceptions.
# language_id - Integer used as a language-name-independent indexed field so that we can rename # language_id - Integer used as a language-name-independent indexed field so that we can rename
# languages in Linguist without reindexing all the code on GitHub. Must not be # languages in Linguist without reindexing all the code on GitHub. Must not be
# changed for existing languages without the explicit permission of GitHub staff. # changed for existing languages without the explicit permission of GitHub staff.
# color - CSS hex color to represent the language. # color - CSS hex color to represent the language. Only used if type is "programming" or "prose".
# tm_scope - The TextMate scope that represents this programming # tm_scope - The TextMate scope that represents this programming
# language. This should match one of the scopes listed in # language. This should match one of the scopes listed in
# the grammars.yml file. Use "none" if there is no grammar # the grammars.yml file. Use "none" if there is no grammar
@@ -36,21 +38,23 @@ header = <<-EOF
# Please keep this list alphabetized. Capitalization comes before lowercase. # Please keep this list alphabetized. Capitalization comes before lowercase.
EOF EOF
require 'digest'
generated = true if ARGV[0] == "--force" generated = true if ARGV[0] == "--force"
update = true if ARGV[0] == "--update" update = true if ARGV[0] == "--update"
# Derives a numeric language_id from a language name.
#
# language - String name of the language.
#
# The SHA-256 hex digest of the name is read as one big integer and then
# reduced modulo (2**30 - 1), so the same name always yields the same id
# and the result lies in 0...(2**30 - 1).
#
# Returns the Integer id.
def generate_language_id(language)
  digest_hex = Digest::SHA256.hexdigest(language)
  id_space = 2**30 - 1
  digest_hex.hex % id_space
end
if generated if generated
puts "You're regenerating all of the language_id attributes for all Linguist " puts "You're regenerating all of the language_id attributes for all Linguist "
puts "languages defined in languages.yml. This is almost certainly NOT what" puts "languages defined in languages.yml. This is almost certainly NOT what"
puts "you meant to do!" puts "you meant to do!"
language_index = 0
languages = YAML.load(File.read("lib/linguist/languages.yml")) languages = YAML.load(File.read("lib/linguist/languages.yml"))
languages.each do |name, vals| languages.each do |name, vals|
vals.merge!('language_id' => language_index) vals.merge!('language_id' => generate_language_id(name))
language_index += 1
end end
File.write("lib/linguist/languages.yml", header + YAML.dump(languages)) File.write("lib/linguist/languages.yml", header + YAML.dump(languages))
@@ -58,20 +62,12 @@ elsif update
puts "Adding new language_id attributes to languages.yml that don't have one set" puts "Adding new language_id attributes to languages.yml that don't have one set"
languages = YAML.load(File.read("lib/linguist/languages.yml")) languages = YAML.load(File.read("lib/linguist/languages.yml"))
# First grab the maximum language_id
language_ids = []
languages.each { |name, vals| language_ids << vals['language_id'] if vals.has_key?('language_id')}
max_language_id = language_ids.max
puts "Current maximum language_id is #{max_language_id}"
missing_count = 0 missing_count = 0
language_index = max_language_id
languages.each do |name, vals| languages.each do |name, vals|
unless vals.has_key?('language_id') unless vals.has_key?('language_id')
language_index += 1
missing_count += 1 missing_count += 1
vals.merge!('language_id' => language_index) vals.merge!('language_id' => generate_language_id(name))
end end
end end

View File

@@ -427,6 +427,14 @@ class TestLanguage < Minitest::Test
assert missing.empty?, message assert missing.empty?, message
end end
# Checks that every language's language_id satisfies 0 <= id < 2**31 - 1,
# and fails with a list of the offending language names otherwise.
def test_all_languages_have_a_valid_id
  upper_bound = 2**31 - 1
  invalid = Language.all.reject do |language|
    id = language.language_id
    id >= 0 && id < upper_bound
  end
  offending_names = invalid.map { |language| "#{language.name}\n" }.join
  message = "The following languages do not have a valid language_id. Please use script/set-language-ids --update as per the contribution guidelines.\n" + offending_names
  assert invalid.empty?, message
end
def test_all_language_id_are_unique def test_all_language_id_are_unique
duplicates = Language.all.group_by{ |language| language.language_id }.select { |k, v| v.size > 1 }.map(&:first) duplicates = Language.all.group_by{ |language| language.language_id }.select { |k, v| v.size > 1 }.map(&:first)