Mirror of https://github.com/KevinMidboe/linguist.git, synced 2025-10-29 17:50:22 +00:00
			
		
		
		
	Merge pull request #3054 from Alhadis/srt
Add support for SubRip Text files and SRecode Templates
		
							
								
								
									
.gitmodules (3 additions, vendored)
@@ -743,3 +743,6 @@
 [submodule "vendor/grammars/language-turing"]
 	path = vendor/grammars/language-turing
 	url = https://github.com/Alhadis/language-turing
+[submodule "vendor/grammars/atom-language-srt"]
+	path = vendor/grammars/atom-language-srt
+	url = https://github.com/314eter/atom-language-srt
grammars.yml

@@ -180,6 +180,8 @@ vendor/grammars/atom-language-clean:
 - source.clean
 vendor/grammars/atom-language-purescript/:
 - source.purescript
+vendor/grammars/atom-language-srt:
+- text.srt
 vendor/grammars/atom-language-stan/:
 - source.stan
 vendor/grammars/atom-salt:
lib/linguist/heuristics.rb

@@ -391,6 +391,12 @@ module Linguist
       end
     end
 
+    disambiguate ".srt" do |data|
+      if /^(\d{2}:\d{2}:\d{2},\d{3})\s*(-->)\s*(\d{2}:\d{2}:\d{2},\d{3})$/.match(data)
+        Language["SubRip Text"]
+      end
+    end
+
     disambiguate ".t" do |data|
       if /^\s*%|^\s*var\s+\w+\s*:\s*\w+/.match(data)
         Language["Turing"]
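Both SRecode Templates and SubRip subtitles use the .srt extension, so the new heuristic keys off the timing line that every SubRip cue contains; when the pattern does not match, the heuristic returns nil and detection falls back to Linguist's other strategies. A minimal standalone sketch of that check in Ruby (not part of this PR; the constant name and sample strings are made up for illustration):

# Same pattern as the disambiguate ".srt" block above. A SubRip cue always
# contains a timing line such as "00:00:01,250 --> 00:00:03,740"; an SRecode
# template never does.
SRT_TIMING = /^(\d{2}:\d{2}:\d{2},\d{3})\s*(-->)\s*(\d{2}:\d{2}:\d{2},\d{3})$/

subrip  = "1\n00:00:01,250 --> 00:00:03,740\nAdding NCL language.\n"
srecode = ";;; linguist.srt --- Template for linguist-example-mode\nset mode \"default\"\n"

puts(SRT_TIMING.match(subrip)  ? "SubRip Text" : "fall through")   # => SubRip Text
puts(SRT_TIMING.match(srecode) ? "SubRip Text" : "fall through")   # => fall through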
lib/linguist/languages.yml

@@ -3336,6 +3336,14 @@ SQLPL:
   - .sql
   - .db2
 
+SRecode Template:
+  type: markup
+  color: "#348a34"
+  tm_scope: source.lisp
+  ace_mode: lisp
+  extensions:
+  - .srt
+
 STON:
   type: data
   group: Smalltalk
@@ -3585,6 +3593,13 @@ Stylus:
   tm_scope: source.stylus
   ace_mode: stylus
 
+SubRip Text:
+  type: data
+  extensions:
+  - .srt
+  ace_mode: text
+  tm_scope: text.srt
+
 SuperCollider:
   type: programming
   color: "#46390b"
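Note that both new languages.yml entries claim the .srt extension, which is exactly why the heuristic further up is needed. As a quick sanity check, the registrations can be inspected through the gem's public API (a sketch, assuming a Linguist build that contains these changes; not part of the PR):

require 'linguist'

srt     = Linguist::Language['SubRip Text']
srecode = Linguist::Language['SRecode Template']

puts srt.extensions.inspect      # => [".srt"]
puts srecode.extensions.inspect  # => [".srt"]

# tm_scope ties each entry to a grammar scope: SubRip Text uses the new
# text.srt scope from atom-language-srt (see grammars.yml above), while
# SRecode Template reuses the existing Lisp grammar.
puts srt.tm_scope                # => "text.srt"
puts srecode.tm_scope            # => "source.lisp"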
							
								
								
									
samples/SRecode Template/linguist.srt (45 lines, new file)
@@ -0,0 +1,45 @@
+;;; linguist.srt --- Template for linguist-example-mode
+
+;; Not copyrighted whatsoever.
+;;
+;; GPL can bite my shiny metal ass.
+;;
+;; GitHub:   1
+;; Stallman: 0
+
+set mode "default"
+
+set comment_start ";"
+
+set LICENSE "It's public domain, baby. This was written for the sole
+purpose of the format's inclusion and recognition by GitHub Linguist.
+This block of multiline text was added because every other .srt file
+I could find was GPL-licensed and had long-winded copyright blobs in
+the file's header. Also, check out my sick line-wrapping abilities."
+
+set DOLLAR "$"
+
+context file
+
+
+template license
+----
+{{LICENSE:srecode-comment-prefix}}
+----
+
+
+template filecomment :file :user :time
+----
+{{comment_start}} {{FILENAME}} --- {{^}}
+{{comment_prefix}} YUO WAN GPL?
+{{comment_prefix}}
+{{comment_prefix}} Copyright (C) {{YEAR}} {{?AUTHOR}}
+{{comment_prefix}}
+{{comment_prefix}} TUO BAD
+{{comment_prefix}} WE EXPAT PEOPLE
+{{comment_prefix}} {{EXPLETIVE}} YOU!
+{{>:copyright}}
+{{comment_end}}
+----
+
+;; end
samples/SubRip Text/ (new sample, 240 lines)

@@ -0,0 +1,240 @@
+1
+00:00:01,250 --> 00:00:03,740
+Adding NCL language.
+
+2
+00:00:04,600 --> 00:00:08,730
+Thanks for the pull request! Do you know if these files are NCL too?
+
+3
+00:00:09,800 --> 00:00:13,700
+Those are poorly-named documentation files for NCL functions.
+
+4
+00:00:14,560 --> 00:00:17,200
+- What's better?
+- This is better.
+
+5
+00:00:18,500 --> 00:00:23,000
+- Would it be correct to recognise these files as text?
+- Yes.
+
+6
+00:00:23,890 --> 00:00:30,000
+In that case, could you add "NCL" to the text entry in languages.yml too?
+
+7
+00:00:30,540 --> 00:00:35,250
+I added the example to "Text" and updated the license in the grammar submodule.
+
+8
+00:00:38,500 --> 00:00:42,360
+Cloning the submodule fails for me in local with this URL.
+
+9
+00:00:42,360 --> 00:00:45,250
+Could you use Git or HTTPS...?
+
+10
+00:00:46,810 --> 00:00:50,000
+I updated the grammar submodule link to HTTPS.
+
+11
+00:00:51,100 --> 00:00:57,000
+It's still failing locally. I don't think you can just update the .gitmodules file.
+
+12
+00:00:57,750 --> 00:01:03,000
+You'll probably have to remove the submodule and add it again to be sure.
+
+13
+00:01:04,336 --> 00:01:11,800
+- I'll see first if it's not an issue on my side...
+- I removed the submodule and added it back with HTTPS.
+
+14
+00:01:13,670 --> 00:01:18,000
+I tested the detection of NCL files with 2000 samples.
+
+15
+00:01:18,000 --> 00:01:25,000
+The Bayesian classifier doesn't seem to be very good at distinguishing text from NCL.
+
+16
+00:01:25,000 --> 00:01:30,740
+We could try to improve it by adding more samples, or we can define a new heuristic rule.
+
+17
+00:01:31,300 --> 00:01:36,200
+- Do you want me to send you the sample files?
+- Yes, please do.
+
+18
+00:01:37,500 --> 00:01:39,500
+In your inbox.
+
+19
+00:01:41,285 --> 00:01:48,216
+- So if I manually go through these and sort out the errors, would that help?
+- Not really.
+
+20
+00:01:48,540 --> 00:01:55,145
+It's a matter of keywords so there's not much to do there except for adding new samples.
+
+21
+00:01:55,447 --> 00:02:02,000
+If adding a few more samples doesn't improve things, we'll see how to define a new heuristic rule.
+
+22
+00:02:04,740 --> 00:02:09,600
+- I added quite a few NCL samples.
+- That's a bit over the top, isn't it?
+
+23
+00:02:10,250 --> 00:02:16,000
+We currently can't add too many samples because of #2117.
+
+24
+00:02:18,000 --> 00:02:20,830
+(sigh) I decreased the number of added samples.
+
+25
+00:02:21,630 --> 00:02:25,300
+Could you test the detection results in local with the samples I gave you?
+
+26
+00:02:26,000 --> 00:02:28,670
+- What is the command to run that test?
+- Here...
+
+27
+00:02:28,716 --> 00:02:38,650
+[Coding intensifies]
+
+28
+00:02:38,650 --> 00:02:43,330
+It is getting hung up on a false detection of Frege in one of the Text samples.
+
+29
+00:02:43,540 --> 00:02:46,115
+Do you have any suggestions for implementing a heuristic?
+
+30
+00:02:47,640 --> 00:02:55,200
+#2441 should fix this. In the meantime, you can change this in "test_heuristics.rb"
+
+31
+00:02:55,165 --> 00:02:57,240
+Why did you have to change this?
+
+32
+00:02:57,777 --> 00:03:04,480
+- It doesn't work for me unless I do that.
+- Hum, same for me. Arfon, does it work for you?
+
+33
+00:03:04,920 --> 00:03:08,830
+Requiring linguist/language doesn't work for me either.
+
+34
+00:03:09,300 --> 00:03:13,885
+We restructured some of the requires a while ago and I think this is just out-of-date code.
+
+35
+00:03:14,065 --> 00:03:20,950
+From a large sample of known NCL files taken from Github, it's now predicting with about 98% accuracy.
+
+36
+00:03:21,183 --> 00:03:28,000
+For a large sample of other files with the NCL extension, it is around 92%.
+
+37
+00:03:27,880 --> 00:03:30,950
+From those, nearly all of the errors come from one GitHub repository,
+
+38
+00:03:30,950 --> 00:03:34,160
+and they all contain the text strings, "The URL" and "The Title".
+
+39
+00:03:35,660 --> 00:03:43,260
+- Do you mean 92% files correctly identified as text?
+- Yes, it correctly identifies 92% as text.
+
+40
+00:03:44,000 --> 00:03:46,150
+I'd really like to see this dramatically reduced.
+
+41
+00:03:46,150 --> 00:03:51,150
+What happens if we reduce to around 5 NCL sample files?
+
+42
+00:03:51,150 --> 00:03:52,600
+Does Linguist still do a reasonable job?
+
+43
+00:03:53,470 --> 00:03:58,190
+I reduced it to 16 NCL samples and 8 text samples.
+
+44
+00:03:58,190 --> 00:04:01,720
+It correctly classifies my whole set of known NCL files.
+
+45
+00:04:01,870 --> 00:04:05,730
+I tried with 5 samples but could not get the same level of accuracy.
+
+46
+00:04:06,670 --> 00:04:10,400
+It incorrectly classifies all of the NCL files in this GitHub repository.
+
+47
+00:04:11,130 --> 00:04:14,660
+All of these files contain the text strings, "THE_URL:" and "THE_TITLE:".
+
+48
+00:04:14,660 --> 00:04:19,500
+It did not misclassify any other text-files with the extension NCL.
+
+49
+00:04:19,970 --> 00:04:25,188
+With 100% accuracy? Does that mean it that the results are better with less samples??
+
+50
+00:04:25,610 --> 00:04:31,190
+I also removed a sample text-file which should have been classified as an NCL file.
+
+51
+00:04:31,000 --> 00:04:35,895
+I think that probably made most of the difference, although I didn't test it atomically.
+
+52
+00:04:35,895 --> 00:04:38,370
+Okay, that makes more sense.
+
+53
+00:04:39,515 --> 00:04:43,450
+I don't get the same results for the text files. Full results here.
+
+54
+00:04:44,650 --> 00:04:50,000
+They all look correctly classified to me, except for the ones in Fanghuan's repository.
+
+55
+00:04:50,000 --> 00:04:55,920
+I manually went through all of the ones where I didn't already know based on the filename or the repository owner.
+
+56
+00:04:56,526 --> 00:05:00,000
+[Presses button] It now correctly classifies all of my test files.
+
+57
+00:05:00,000 --> 00:05:05,970
+R. Pavlick, thanks for this. These changes will be live in the next release of Linguist. In the next couple of weeks.
+
+58
+00:05:05,970 --> 00:05:07,450
+Great! Thanks.
vendor/grammars/atom-language-srt (new submodule, vendored)

Submodule vendor/grammars/atom-language-srt added at 386b4b64ae
vendor/licenses/grammar/atom-language-srt.txt (25 lines, new file, vendored)
@@ -0,0 +1,25 @@
+---
+type: grammar
+name: atom-language-srt
+license: mit
+---
+Copyright (c) 2016 Pieter Goetschalckx
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.