diff --git a/.gitmodules b/.gitmodules index 0b861dfd..4e7e86d6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -743,3 +743,6 @@ [submodule "vendor/grammars/language-turing"] path = vendor/grammars/language-turing url = https://github.com/Alhadis/language-turing +[submodule "vendor/grammars/atom-language-srt"] + path = vendor/grammars/atom-language-srt + url = https://github.com/314eter/atom-language-srt diff --git a/grammars.yml b/grammars.yml index 7027e01d..d76e9546 100755 --- a/grammars.yml +++ b/grammars.yml @@ -180,6 +180,8 @@ vendor/grammars/atom-language-clean: - source.clean vendor/grammars/atom-language-purescript/: - source.purescript +vendor/grammars/atom-language-srt: +- text.srt vendor/grammars/atom-language-stan/: - source.stan vendor/grammars/atom-salt: diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index fc294192..45a7d9dc 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -391,6 +391,12 @@ module Linguist end end + disambiguate ".srt" do |data| + if /^(\d{2}:\d{2}:\d{2},\d{3})\s*(-->)\s*(\d{2}:\d{2}:\d{2},\d{3})$/.match(data) + Language["SubRip Text"] + end + end + disambiguate ".t" do |data| if /^\s*%|^\s*var\s+\w+\s*:\s*\w+/.match(data) Language["Turing"] diff --git a/lib/linguist/languages.yml b/lib/linguist/languages.yml index 581a6420..7b6b189f 100755 --- a/lib/linguist/languages.yml +++ b/lib/linguist/languages.yml @@ -3336,6 +3336,14 @@ SQLPL: - .sql - .db2 +SRecode Template: + type: markup + color: "#348a34" + tm_scope: source.lisp + ace_mode: lisp + extensions: + - .srt + STON: type: data group: Smalltalk @@ -3585,6 +3593,13 @@ Stylus: tm_scope: source.stylus ace_mode: stylus +SubRip Text: + type: data + extensions: + - .srt + ace_mode: text + tm_scope: text.srt + SuperCollider: type: programming color: "#46390b" diff --git a/samples/SRecode Template/linguist.srt b/samples/SRecode Template/linguist.srt new file mode 100644 index 00000000..0be2b337 --- /dev/null +++ b/samples/SRecode Template/linguist.srt @@ -0,0 +1,45 @@ +;;; linguist.srt --- Template for linguist-example-mode + +;; Not copyrighted whatsoever. +;; +;; GPL can bite my shiny metal ass. +;; +;; GitHub: 1 +;; Stallman: 0 + +set mode "default" + +set comment_start ";" + +set LICENSE "It's public domain, baby. This was written for the sole +purpose of the format's inclusion and recognition by GitHub Linguist. +This block of multiline text was added because every other .srt file +I could find was GPL-licensed and had long-winded copyright blobs in +the file's header. Also, check out my sick line-wrapping abilities." + +set DOLLAR "$" + +context file + + +template license +---- +{{LICENSE:srecode-comment-prefix}} +---- + + +template filecomment :file :user :time +---- +{{comment_start}} {{FILENAME}} --- {{^}} +{{comment_prefix}} YUO WAN GPL? +{{comment_prefix}} +{{comment_prefix}} Copyright (C) {{YEAR}} {{?AUTHOR}} +{{comment_prefix}} +{{comment_prefix}} TUO BAD +{{comment_prefix}} WE EXPAT PEOPLE +{{comment_prefix}} {{EXPLETIVE}} YOU! +{{>:copyright}} +{{comment_end}} +---- + +;; end diff --git a/samples/SubRip Text/Adding.NCL.Language.S01E01.1080p.BluRay.x264.srt b/samples/SubRip Text/Adding.NCL.Language.S01E01.1080p.BluRay.x264.srt new file mode 100644 index 00000000..49198d7d --- /dev/null +++ b/samples/SubRip Text/Adding.NCL.Language.S01E01.1080p.BluRay.x264.srt @@ -0,0 +1,240 @@ +1 +00:00:01,250 --> 00:00:03,740 +Adding NCL language. + +2 +00:00:04,600 --> 00:00:08,730 +Thanks for the pull request! Do you know if these files are NCL too? + +3 +00:00:09,800 --> 00:00:13,700 +Those are poorly-named documentation files for NCL functions. + +4 +00:00:14,560 --> 00:00:17,200 +- What's better? +- This is better. + +5 +00:00:18,500 --> 00:00:23,000 +- Would it be correct to recognise these files as text? +- Yes. + +6 +00:00:23,890 --> 00:00:30,000 +In that case, could you add "NCL" to the text entry in languages.yml too? + +7 +00:00:30,540 --> 00:00:35,250 +I added the example to "Text" and updated the license in the grammar submodule. + +8 +00:00:38,500 --> 00:00:42,360 +Cloning the submodule fails for me in local with this URL. + +9 +00:00:42,360 --> 00:00:45,250 +Could you use Git or HTTPS...? + +10 +00:00:46,810 --> 00:00:50,000 +I updated the grammar submodule link to HTTPS. + +11 +00:00:51,100 --> 00:00:57,000 +It's still failing locally. I don't think you can just update the .gitmodules file. + +12 +00:00:57,750 --> 00:01:03,000 +You'll probably have to remove the submodule and add it again to be sure. + +13 +00:01:04,336 --> 00:01:11,800 +- I'll see first if it's not an issue on my side... +- I removed the submodule and added it back with HTTPS. + +14 +00:01:13,670 --> 00:01:18,000 +I tested the detection of NCL files with 2000 samples. + +15 +00:01:18,000 --> 00:01:25,000 +The Bayesian classifier doesn't seem to be very good at distinguishing text from NCL. + +16 +00:01:25,000 --> 00:01:30,740 +We could try to improve it by adding more samples, or we can define a new heuristic rule. + +17 +00:01:31,300 --> 00:01:36,200 +- Do you want me to send you the sample files? +- Yes, please do. + +18 +00:01:37,500 --> 00:01:39,500 +In your inbox. + +19 +00:01:41,285 --> 00:01:48,216 +- So if I manually go through these and sort out the errors, would that help? +- Not really. + +20 +00:01:48,540 --> 00:01:55,145 +It's a matter of keywords so there's not much to do there except for adding new samples. + +21 +00:01:55,447 --> 00:02:02,000 +If adding a few more samples doesn't improve things, we'll see how to define a new heuristic rule. + +22 +00:02:04,740 --> 00:02:09,600 +- I added quite a few NCL samples. +- That's a bit over the top, isn't it? + +23 +00:02:10,250 --> 00:02:16,000 +We currently can't add too many samples because of #2117. + +24 +00:02:18,000 --> 00:02:20,830 +(sigh) I decreased the number of added samples. + +25 +00:02:21,630 --> 00:02:25,300 +Could you test the detection results in local with the samples I gave you? + +26 +00:02:26,000 --> 00:02:28,670 +- What is the command to run that test? +- Here... + +27 +00:02:28,716 --> 00:02:38,650 +[Coding intensifies] + +28 +00:02:38,650 --> 00:02:43,330 +It is getting hung up on a false detection of Frege in one of the Text samples. + +29 +00:02:43,540 --> 00:02:46,115 +Do you have any suggestions for implementing a heuristic? + +30 +00:02:47,640 --> 00:02:55,200 +#2441 should fix this. In the meantime, you can change this in "test_heuristics.rb" + +31 +00:02:55,165 --> 00:02:57,240 +Why did you have to change this? + +32 +00:02:57,777 --> 00:03:04,480 +- It doesn't work for me unless I do that. +- Hum, same for me. Arfon, does it work for you? + +33 +00:03:04,920 --> 00:03:08,830 +Requiring linguist/language doesn't work for me either. + +34 +00:03:09,300 --> 00:03:13,885 +We restructured some of the requires a while ago and I think this is just out-of-date code. + +35 +00:03:14,065 --> 00:03:20,950 +From a large sample of known NCL files taken from Github, it's now predicting with about 98% accuracy. + +36 +00:03:21,183 --> 00:03:28,000 +For a large sample of other files with the NCL extension, it is around 92%. + +37 +00:03:27,880 --> 00:03:30,950 +From those, nearly all of the errors come from one GitHub repository, + +38 +00:03:30,950 --> 00:03:34,160 +and they all contain the text strings, "The URL" and "The Title". + +39 +00:03:35,660 --> 00:03:43,260 +- Do you mean 92% files correctly identified as text? +- Yes, it correctly identifies 92% as text. + +40 +00:03:44,000 --> 00:03:46,150 +I'd really like to see this dramatically reduced. + +41 +00:03:46,150 --> 00:03:51,150 +What happens if we reduce to around 5 NCL sample files? + +42 +00:03:51,150 --> 00:03:52,600 +Does Linguist still do a reasonable job? + +43 +00:03:53,470 --> 00:03:58,190 +I reduced it to 16 NCL samples and 8 text samples. + +44 +00:03:58,190 --> 00:04:01,720 +It correctly classifies my whole set of known NCL files. + +45 +00:04:01,870 --> 00:04:05,730 +I tried with 5 samples but could not get the same level of accuracy. + +46 +00:04:06,670 --> 00:04:10,400 +It incorrectly classifies all of the NCL files in this GitHub repository. + +47 +00:04:11,130 --> 00:04:14,660 +All of these files contain the text strings, "THE_URL:" and "THE_TITLE:". + +48 +00:04:14,660 --> 00:04:19,500 +It did not misclassify any other text-files with the extension NCL. + +49 +00:04:19,970 --> 00:04:25,188 +With 100% accuracy? Does that mean it that the results are better with less samples?? + +50 +00:04:25,610 --> 00:04:31,190 +I also removed a sample text-file which should have been classified as an NCL file. + +51 +00:04:31,000 --> 00:04:35,895 +I think that probably made most of the difference, although I didn't test it atomically. + +52 +00:04:35,895 --> 00:04:38,370 +Okay, that makes more sense. + +53 +00:04:39,515 --> 00:04:43,450 +I don't get the same results for the text files. Full results here. + +54 +00:04:44,650 --> 00:04:50,000 +They all look correctly classified to me, except for the ones in Fanghuan's repository. + +55 +00:04:50,000 --> 00:04:55,920 +I manually went through all of the ones where I didn't already know based on the filename or the repository owner. + +56 +00:04:56,526 --> 00:05:00,000 +[Presses button] It now correctly classifies all of my test files. + +57 +00:05:00,000 --> 00:05:05,970 +R. Pavlick, thanks for this. These changes will be live in the next release of Linguist. In the next couple of weeks. + +58 +00:05:05,970 --> 00:05:07,450 +Great! Thanks. diff --git a/vendor/grammars/atom-language-srt b/vendor/grammars/atom-language-srt new file mode 160000 index 00000000..386b4b64 --- /dev/null +++ b/vendor/grammars/atom-language-srt @@ -0,0 +1 @@ +Subproject commit 386b4b64aee092200753fe12f6caadddc844271b diff --git a/vendor/licenses/grammar/atom-language-srt.txt b/vendor/licenses/grammar/atom-language-srt.txt new file mode 100644 index 00000000..9a4c73e0 --- /dev/null +++ b/vendor/licenses/grammar/atom-language-srt.txt @@ -0,0 +1,25 @@ +--- +type: grammar +name: atom-language-srt +license: mit +--- +Copyright (c) 2016 Pieter Goetschalckx + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.