Mirror of https://github.com/KevinMidboe/linguist.git (synced 2025-10-29 17:50:22 +00:00)
	Merge pull request #3054 from Alhadis/srt
Add support for SubRip Text files and SRecode Templates
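Both new languages claim the .srt extension, so the languages.yml entries below cannot resolve it on their own; the commit also adds a disambiguation rule to heuristics.rb. The sketch below is illustrative only, not part of this commit, and assumes a linguist gem version that already includes this change; it shows how the new rule tells the two languages apart.

    require "linguist"

    # Same timestamp pattern as the new disambiguate ".srt" rule below.
    TIMESTAMP = /^(\d{2}:\d{2}:\d{2},\d{3})\s*(-->)\s*(\d{2}:\d{2}:\d{2},\d{3})$/

    subrip_cue  = "1\n00:00:01,250 --> 00:00:03,740\nAdding NCL language.\n"
    srecode_tpl = ";;; linguist.srt --- Template for linguist-example-mode\nset mode \"default\"\n"

    # A timestamp line resolves the file to SubRip Text; any other .srt file is
    # left undecided here and falls through to the sample-trained classifier.
    [subrip_cue, srecode_tpl].each do |data|
      language = TIMESTAMP.match(data) ? Linguist::Language["SubRip Text"] : nil
      puts language ? language.name : "undecided"
    end
    # => SubRip Text
    # => undecided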
.gitmodules | 3 (vendored)
							| @@ -743,3 +743,6 @@ | ||||
| [submodule "vendor/grammars/language-turing"] | ||||
| 	path = vendor/grammars/language-turing | ||||
| 	url = https://github.com/Alhadis/language-turing | ||||
| [submodule "vendor/grammars/atom-language-srt"] | ||||
| 	path = vendor/grammars/atom-language-srt | ||||
| 	url = https://github.com/314eter/atom-language-srt | ||||
|   | ||||
| @@ -180,6 +180,8 @@ vendor/grammars/atom-language-clean: | ||||
| - source.clean | ||||
| vendor/grammars/atom-language-purescript/: | ||||
| - source.purescript | ||||
| vendor/grammars/atom-language-srt: | ||||
| - text.srt | ||||
| vendor/grammars/atom-language-stan/: | ||||
| - source.stan | ||||
| vendor/grammars/atom-salt: | ||||
|   | ||||
| @@ -391,6 +391,12 @@ module Linguist | ||||
|       end | ||||
|     end | ||||
|      | ||||
|     disambiguate ".srt" do |data| | ||||
|       if /^(\d{2}:\d{2}:\d{2},\d{3})\s*(-->)\s*(\d{2}:\d{2}:\d{2},\d{3})$/.match(data) | ||||
|         Language["SubRip Text"] | ||||
|       end | ||||
|     end | ||||
|      | ||||
|     disambiguate ".t" do |data| | ||||
|       if /^\s*%|^\s*var\s+\w+\s*:\s*\w+/.match(data) | ||||
|         Language["Turing"] | ||||
|   | ||||
| @@ -3336,6 +3336,14 @@ SQLPL: | ||||
|   - .sql | ||||
|   - .db2 | ||||
|  | ||||
| SRecode Template: | ||||
|   type: markup | ||||
|   color: "#348a34" | ||||
|   tm_scope: source.lisp | ||||
|   ace_mode: lisp | ||||
|   extensions: | ||||
|   - .srt | ||||
|  | ||||
| STON: | ||||
|   type: data | ||||
|   group: Smalltalk | ||||
| @@ -3585,6 +3593,13 @@ Stylus: | ||||
|   tm_scope: source.stylus | ||||
|   ace_mode: stylus | ||||
|  | ||||
| SubRip Text: | ||||
|   type: data | ||||
|   extensions: | ||||
|   - .srt | ||||
|   ace_mode: text | ||||
|   tm_scope: text.srt | ||||
|  | ||||
| SuperCollider: | ||||
|   type: programming | ||||
|   color: "#46390b" | ||||
|   | ||||
							
								
								
									
samples/SRecode Template/linguist.srt | 45 (new file)
							| @@ -0,0 +1,45 @@ | ||||
| ;;; linguist.srt --- Template for linguist-example-mode | ||||
|  | ||||
| ;; Not copyrighted whatsoever. | ||||
| ;; | ||||
| ;; GPL can bite my shiny metal ass. | ||||
| ;; | ||||
| ;; GitHub:   1 | ||||
| ;; Stallman: 0 | ||||
|  | ||||
| set mode "default" | ||||
|  | ||||
| set comment_start ";" | ||||
|  | ||||
| set LICENSE "It's public domain, baby. This was written for the sole | ||||
| purpose of the format's inclusion and recognition by GitHub Linguist. | ||||
| This block of multiline text was added because every other .srt file | ||||
| I could find was GPL-licensed and had long-winded copyright blobs in | ||||
| the file's header. Also, check out my sick line-wrapping abilities." | ||||
|  | ||||
| set DOLLAR "$" | ||||
|  | ||||
| context file | ||||
|  | ||||
|  | ||||
| template license | ||||
| ---- | ||||
| {{LICENSE:srecode-comment-prefix}} | ||||
| ---- | ||||
|  | ||||
|  | ||||
| template filecomment :file :user :time | ||||
| ---- | ||||
| {{comment_start}} {{FILENAME}} --- {{^}} | ||||
| {{comment_prefix}} YUO WAN GPL? | ||||
| {{comment_prefix}}  | ||||
| {{comment_prefix}} Copyright (C) {{YEAR}} {{?AUTHOR}} | ||||
| {{comment_prefix}} | ||||
| {{comment_prefix}} TUO BAD | ||||
| {{comment_prefix}} WE EXPAT PEOPLE | ||||
| {{comment_prefix}} {{EXPLETIVE}} YOU! | ||||
| {{>:copyright}} | ||||
| {{comment_end}} | ||||
| ---- | ||||
|  | ||||
| ;; end | ||||
| @@ -0,0 +1,240 @@ | ||||
| 1 | ||||
| 00:00:01,250 --> 00:00:03,740 | ||||
| Adding NCL language. | ||||
|  | ||||
| 2 | ||||
| 00:00:04,600 --> 00:00:08,730 | ||||
| Thanks for the pull request! Do you know if these files are NCL too? | ||||
|  | ||||
| 3 | ||||
| 00:00:09,800 --> 00:00:13,700 | ||||
| Those are poorly-named documentation files for NCL functions. | ||||
|  | ||||
| 4 | ||||
| 00:00:14,560 --> 00:00:17,200 | ||||
| - What's better? | ||||
| - This is better. | ||||
|  | ||||
| 5 | ||||
| 00:00:18,500 --> 00:00:23,000 | ||||
| - Would it be correct to recognise these files as text? | ||||
| - Yes. | ||||
|  | ||||
| 6 | ||||
| 00:00:23,890 --> 00:00:30,000 | ||||
| In that case, could you add "NCL" to the text entry in languages.yml too? | ||||
|  | ||||
| 7 | ||||
| 00:00:30,540 --> 00:00:35,250 | ||||
| I added the example to "Text" and updated the license in the grammar submodule. | ||||
|  | ||||
| 8 | ||||
| 00:00:38,500 --> 00:00:42,360 | ||||
| Cloning the submodule fails for me locally with this URL. | ||||
|  | ||||
| 9 | ||||
| 00:00:42,360 --> 00:00:45,250 | ||||
| Could you use Git or HTTPS...? | ||||
|  | ||||
| 10 | ||||
| 00:00:46,810 --> 00:00:50,000 | ||||
| I updated the grammar submodule link to HTTPS. | ||||
|  | ||||
| 11 | ||||
| 00:00:51,100 --> 00:00:57,000 | ||||
| It's still failing locally. I don't think you can just update the .gitmodules file. | ||||
|  | ||||
| 12 | ||||
| 00:00:57,750 --> 00:01:03,000 | ||||
| You'll probably have to remove the submodule and add it again to be sure. | ||||
|  | ||||
| 13 | ||||
| 00:01:04,336 --> 00:01:11,800 | ||||
| - I'll see first if it's not an issue on my side... | ||||
| - I removed the submodule and added it back with HTTPS. | ||||
|  | ||||
| 14 | ||||
| 00:01:13,670 --> 00:01:18,000 | ||||
| I tested the detection of NCL files with 2000 samples. | ||||
|  | ||||
| 15 | ||||
| 00:01:18,000 --> 00:01:25,000 | ||||
| The Bayesian classifier doesn't seem to be very good at distinguishing text from NCL. | ||||
|  | ||||
| 16 | ||||
| 00:01:25,000 --> 00:01:30,740 | ||||
| We could try to improve it by adding more samples, or we can define a new heuristic rule. | ||||
|  | ||||
| 17 | ||||
| 00:01:31,300 --> 00:01:36,200 | ||||
| - Do you want me to send you the sample files? | ||||
| - Yes, please do. | ||||
|  | ||||
| 18 | ||||
| 00:01:37,500 --> 00:01:39,500 | ||||
| In your inbox. | ||||
|  | ||||
| 19 | ||||
| 00:01:41,285 --> 00:01:48,216 | ||||
| - So if I manually go through these and sort out the errors, would that help? | ||||
| - Not really. | ||||
|  | ||||
| 20 | ||||
| 00:01:48,540 --> 00:01:55,145 | ||||
| It's a matter of keywords so there's not much to do there except for adding new samples. | ||||
|  | ||||
| 21 | ||||
| 00:01:55,447 --> 00:02:02,000 | ||||
| If adding a few more samples doesn't improve things, we'll see how to define a new heuristic rule. | ||||
|  | ||||
| 22 | ||||
| 00:02:04,740 --> 00:02:09,600 | ||||
| - I added quite a few NCL samples. | ||||
| - That's a bit over the top, isn't it? | ||||
|  | ||||
| 23 | ||||
| 00:02:10,250 --> 00:02:16,000 | ||||
| We currently can't add too many samples because of #2117. | ||||
|  | ||||
| 24 | ||||
| 00:02:18,000 --> 00:02:20,830 | ||||
| (sigh) I decreased the number of added samples. | ||||
|  | ||||
| 25 | ||||
| 00:02:21,630 --> 00:02:25,300 | ||||
| Could you test the detection results locally with the samples I gave you? | ||||
|  | ||||
| 26 | ||||
| 00:02:26,000 --> 00:02:28,670 | ||||
| - What is the command to run that test? | ||||
| - Here... | ||||
|  | ||||
| 27 | ||||
| 00:02:28,716 --> 00:02:38,650 | ||||
| [Coding intensifies] | ||||
|  | ||||
| 28 | ||||
| 00:02:38,650 --> 00:02:43,330 | ||||
| It is getting hung up on a false detection of Frege in one of the Text samples. | ||||
|  | ||||
| 29 | ||||
| 00:02:43,540 --> 00:02:46,115 | ||||
| Do you have any suggestions for implementing a heuristic? | ||||
|  | ||||
| 30 | ||||
| 00:02:47,640 --> 00:02:55,200 | ||||
| #2441 should fix this. In the meantime, you can change this in "test_heuristics.rb" | ||||
|  | ||||
| 31 | ||||
| 00:02:55,165 --> 00:02:57,240 | ||||
| Why did you have to change this? | ||||
|  | ||||
| 32 | ||||
| 00:02:57,777 --> 00:03:04,480 | ||||
| - It doesn't work for me unless I do that. | ||||
| - Hum, same for me. Arfon, does it work for you? | ||||
|  | ||||
| 33 | ||||
| 00:03:04,920 --> 00:03:08,830 | ||||
| Requiring linguist/language doesn't work for me either. | ||||
|  | ||||
| 34 | ||||
| 00:03:09,300 --> 00:03:13,885 | ||||
| We restructured some of the requires a while ago and I think this is just out-of-date code. | ||||
|  | ||||
| 35 | ||||
| 00:03:14,065 --> 00:03:20,950 | ||||
| From a large sample of known NCL files taken from GitHub, it's now predicting with about 98% accuracy. | ||||
|  | ||||
| 36 | ||||
| 00:03:21,183 --> 00:03:28,000 | ||||
| For a large sample of other files with the NCL extension, it is around 92%. | ||||
|  | ||||
| 37 | ||||
| 00:03:27,880 --> 00:03:30,950 | ||||
| From those, nearly all of the errors come from one GitHub repository, | ||||
|  | ||||
| 38 | ||||
| 00:03:30,950 --> 00:03:34,160 | ||||
| and they all contain the text strings, "The URL" and "The Title". | ||||
|  | ||||
| 39 | ||||
| 00:03:35,660 --> 00:03:43,260 | ||||
| - Do you mean 92% files correctly identified as text? | ||||
| - Yes, it correctly identifies 92% as text. | ||||
|  | ||||
| 40 | ||||
| 00:03:44,000 --> 00:03:46,150 | ||||
| I'd really like to see this dramatically reduced. | ||||
|  | ||||
| 41 | ||||
| 00:03:46,150 --> 00:03:51,150 | ||||
| What happens if we reduce to around 5 NCL sample files? | ||||
|  | ||||
| 42 | ||||
| 00:03:51,150 --> 00:03:52,600 | ||||
| Does Linguist still do a reasonable job? | ||||
|  | ||||
| 43 | ||||
| 00:03:53,470 --> 00:03:58,190 | ||||
| I reduced it to 16 NCL samples and 8 text samples. | ||||
|  | ||||
| 44 | ||||
| 00:03:58,190 --> 00:04:01,720 | ||||
| It correctly classifies my whole set of known NCL files. | ||||
|  | ||||
| 45 | ||||
| 00:04:01,870 --> 00:04:05,730 | ||||
| I tried with 5 samples but could not get the same level of accuracy. | ||||
|  | ||||
| 46 | ||||
| 00:04:06,670 --> 00:04:10,400 | ||||
| It incorrectly classifies all of the NCL files in this GitHub repository. | ||||
|  | ||||
| 47 | ||||
| 00:04:11,130 --> 00:04:14,660 | ||||
| All of these files contain the text strings, "THE_URL:" and "THE_TITLE:". | ||||
|  | ||||
| 48 | ||||
| 00:04:14,660 --> 00:04:19,500 | ||||
| It did not misclassify any other text-files with the extension NCL. | ||||
|  | ||||
| 49 | ||||
| 00:04:19,970 --> 00:04:25,188 | ||||
| With 100% accuracy? Does that mean that the results are better with fewer samples? | ||||
|  | ||||
| 50 | ||||
| 00:04:25,610 --> 00:04:31,190 | ||||
| I also removed a sample text-file which should have been classified as an NCL file. | ||||
|  | ||||
| 51 | ||||
| 00:04:31,000 --> 00:04:35,895 | ||||
| I think that probably made most of the difference, although I didn't test it atomically. | ||||
|  | ||||
| 52 | ||||
| 00:04:35,895 --> 00:04:38,370 | ||||
| Okay, that makes more sense. | ||||
|  | ||||
| 53 | ||||
| 00:04:39,515 --> 00:04:43,450 | ||||
| I don't get the same results for the text files. Full results here. | ||||
|  | ||||
| 54 | ||||
| 00:04:44,650 --> 00:04:50,000 | ||||
| They all look correctly classified to me, except for the ones in Fanghuan's repository. | ||||
|  | ||||
| 55 | ||||
| 00:04:50,000 --> 00:04:55,920 | ||||
| I manually went through all of the ones where I didn't already know based on the filename or the repository owner. | ||||
|  | ||||
| 56 | ||||
| 00:04:56,526 --> 00:05:00,000 | ||||
| [Presses button] It now correctly classifies all of my test files. | ||||
|  | ||||
| 57 | ||||
| 00:05:00,000 --> 00:05:05,970 | ||||
| R. Pavlick, thanks for this. These changes will be live in the next release of Linguist, in the next couple of weeks. | ||||
|  | ||||
| 58 | ||||
| 00:05:05,970 --> 00:05:07,450 | ||||
| Great! Thanks. | ||||
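The dialogue in this sample mirrors the actual review workflow: extension candidates are narrowed by heuristics such as the one added here, and the Bayesian classifier trained on the files under samples/ breaks any remaining ties. A hypothetical way to check what a local .srt file resolves to once this change ships, assuming the gem's FileBlob and Linguist.detect API are available (the path below is made up):

    require "linguist"

    # FileBlob wraps a file on disk; Linguist.detect runs the full pipeline:
    # filename/extension candidates -> heuristics -> sample-trained classifier.
    blob = Linguist::FileBlob.new("subtitles/episode-01.srt")  # hypothetical path
    language = Linguist.detect(blob)
    puts language ? language.name : "unrecognised"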
							
								
								
									
vendor/grammars/atom-language-srt | 1 (vendored submodule)
							 Submodule vendor/grammars/atom-language-srt added at 386b4b64ae
									
								
							
							
								
								
									
vendor/licenses/grammar/atom-language-srt.txt | 25 (vendored, new file)
							| @@ -0,0 +1,25 @@ | ||||
| --- | ||||
| type: grammar | ||||
| name: atom-language-srt | ||||
| license: mit | ||||
| --- | ||||
| Copyright (c) 2016 Pieter Goetschalckx | ||||
|  | ||||
| Permission is hereby granted, free of charge, to any person obtaining | ||||
| a copy of this software and associated documentation files (the | ||||
| "Software"), to deal in the Software without restriction, including | ||||
| without limitation the rights to use, copy, modify, merge, publish, | ||||
| distribute, sublicense, and/or sell copies of the Software, and to | ||||
| permit persons to whom the Software is furnished to do so, subject to | ||||
| the following conditions: | ||||
|  | ||||
| The above copyright notice and this permission notice shall be | ||||
| included in all copies or substantial portions of the Software. | ||||
|  | ||||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||||
| EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||||
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||||
| NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE | ||||
| LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | ||||
| OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION | ||||
| WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||||