Merge pull request #3054 from Alhadis/srt
Add support for SubRip Text files and SRecode Templates

.gitmodules (3 changes, vendored)
@@ -743,3 +743,6 @@
 [submodule "vendor/grammars/language-turing"]
 	path = vendor/grammars/language-turing
 	url = https://github.com/Alhadis/language-turing
+[submodule "vendor/grammars/atom-language-srt"]
+	path = vendor/grammars/atom-language-srt
+	url = https://github.com/314eter/atom-language-srt

@@ -180,6 +180,8 @@ vendor/grammars/atom-language-clean:
 - source.clean
 vendor/grammars/atom-language-purescript/:
 - source.purescript
+vendor/grammars/atom-language-srt:
+- text.srt
 vendor/grammars/atom-language-stan/:
 - source.stan
 vendor/grammars/atom-salt:

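As an aside (not part of the commit), the scope mapping added above can be read back with a couple of lines of Ruby. This is a hedged sketch that assumes the file being modified here is grammars.yml, as in upstream Linguist:

    # Hedged sketch: read the scope mapping back out of the YAML file this
    # hunk appears to modify (grammars.yml in upstream Linguist).
    require "yaml"

    scopes = YAML.load_file("grammars.yml")
    puts scopes["vendor/grammars/atom-language-srt"].inspect   # => ["text.srt"]
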
@@ -391,6 +391,12 @@ module Linguist
       end
     end
 
+    disambiguate ".srt" do |data|
+      if /^(\d{2}:\d{2}:\d{2},\d{3})\s*(-->)\s*(\d{2}:\d{2}:\d{2},\d{3})$/.match(data)
+        Language["SubRip Text"]
+      end
+    end
+
     disambiguate ".t" do |data|
       if /^\s*%|^\s*var\s+\w+\s*:\s*\w+/.match(data)
         Language["Turing"]

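The new heuristic keys off the SubRip cue timing line. As a quick sanity check outside the diff, the same pattern can be exercised in a Ruby console; the data string below is a made-up stand-in for the head of a candidate .srt file:

    # Minimal sketch: Ruby's ^ and $ anchor at line boundaries, so the pattern
    # matches the cue timing line even inside a multi-line sample.
    data    = "1\n00:00:01,250 --> 00:00:03,740\nAdding NCL language.\n"
    srt_cue = /^(\d{2}:\d{2}:\d{2},\d{3})\s*(-->)\s*(\d{2}:\d{2}:\d{2},\d{3})$/
    puts "looks like SubRip Text" if srt_cue.match(data)

An .srt file that does not match this pattern falls through to the statistical classifier, which is what the SRecode Template sample added below is for.
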
@@ -3336,6 +3336,14 @@ SQLPL:
   - .sql
   - .db2
 
+SRecode Template:
+  type: markup
+  color: "#348a34"
+  tm_scope: source.lisp
+  ace_mode: lisp
+  extensions:
+  - .srt
+
 STON:
   type: data
   group: Smalltalk

@@ -3585,6 +3593,13 @@ Stylus:
   tm_scope: source.stylus
   ace_mode: stylus
 
+SubRip Text:
+  type: data
+  extensions:
+  - .srt
+  ace_mode: text
+  tm_scope: text.srt
+
 SuperCollider:
   type: programming
   color: "#46390b"

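Since both new languages claim the .srt extension, the disambiguation rule above decides between them for real files. The two entries themselves can be checked from an interactive session; this is only an illustrative sketch (not part of the commit), assuming the gem built from this branch is on the load path:

    # Hedged sketch: look up the two languages defined above through the
    # public Linguist::Language API.
    require "linguist"

    srt  = Linguist::Language["SubRip Text"]
    tmpl = Linguist::Language["SRecode Template"]

    puts srt.extensions.inspect   # => [".srt"]
    puts srt.tm_scope             # => "text.srt"
    puts tmpl.ace_mode            # => "lisp"
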
							
								
								
									
samples/SRecode Template/linguist.srt (45 changes, Normal file)
@@ -0,0 +1,45 @@
;;; linguist.srt --- Template for linguist-example-mode

;; Not copyrighted whatsoever.
;;
;; GPL can bite my shiny metal ass.
;;
;; GitHub:   1
;; Stallman: 0

set mode "default"

set comment_start ";"

set LICENSE "It's public domain, baby. This was written for the sole
purpose of the format's inclusion and recognition by GitHub Linguist.
This block of multiline text was added because every other .srt file
I could find was GPL-licensed and had long-winded copyright blobs in
the file's header. Also, check out my sick line-wrapping abilities."

set DOLLAR "$"

context file


template license
----
{{LICENSE:srecode-comment-prefix}}
----


template filecomment :file :user :time
----
{{comment_start}} {{FILENAME}} --- {{^}}
{{comment_prefix}} YUO WAN GPL?
{{comment_prefix}}
{{comment_prefix}} Copyright (C) {{YEAR}} {{?AUTHOR}}
{{comment_prefix}}
{{comment_prefix}} TUO BAD
{{comment_prefix}} WE EXPAT PEOPLE
{{comment_prefix}} {{EXPLETIVE}} YOU!
{{>:copyright}}
{{comment_end}}
----

;; end

samples/SubRip Text/… (240 changes, Normal file)
@@ -0,0 +1,240 @@
1
00:00:01,250 --> 00:00:03,740
Adding NCL language.

2
00:00:04,600 --> 00:00:08,730
Thanks for the pull request! Do you know if these files are NCL too?

3
00:00:09,800 --> 00:00:13,700
Those are poorly-named documentation files for NCL functions.

4
00:00:14,560 --> 00:00:17,200
- What's better?
- This is better.

5
00:00:18,500 --> 00:00:23,000
- Would it be correct to recognise these files as text?
- Yes.

6
00:00:23,890 --> 00:00:30,000
In that case, could you add "NCL" to the text entry in languages.yml too?

7
00:00:30,540 --> 00:00:35,250
I added the example to "Text" and updated the license in the grammar submodule.

8
00:00:38,500 --> 00:00:42,360
Cloning the submodule fails for me locally with this URL.

9
00:00:42,360 --> 00:00:45,250
Could you use Git or HTTPS...?

10
00:00:46,810 --> 00:00:50,000
I updated the grammar submodule link to HTTPS.

11
00:00:51,100 --> 00:00:57,000
It's still failing locally. I don't think you can just update the .gitmodules file.

12
00:00:57,750 --> 00:01:03,000
You'll probably have to remove the submodule and add it again to be sure.

13
00:01:04,336 --> 00:01:11,800
- I'll see first if it's not an issue on my side...
- I removed the submodule and added it back with HTTPS.

14
00:01:13,670 --> 00:01:18,000
I tested the detection of NCL files with 2000 samples.

15
00:01:18,000 --> 00:01:25,000
The Bayesian classifier doesn't seem to be very good at distinguishing text from NCL.

16
00:01:25,000 --> 00:01:30,740
We could try to improve it by adding more samples, or we can define a new heuristic rule.

17
00:01:31,300 --> 00:01:36,200
- Do you want me to send you the sample files?
- Yes, please do.

18
00:01:37,500 --> 00:01:39,500
In your inbox.

19
00:01:41,285 --> 00:01:48,216
- So if I manually go through these and sort out the errors, would that help?
- Not really.

20
00:01:48,540 --> 00:01:55,145
It's a matter of keywords so there's not much to do there except for adding new samples.

21
00:01:55,447 --> 00:02:02,000
If adding a few more samples doesn't improve things, we'll see how to define a new heuristic rule.

22
00:02:04,740 --> 00:02:09,600
- I added quite a few NCL samples.
- That's a bit over the top, isn't it?

23
00:02:10,250 --> 00:02:16,000
We currently can't add too many samples because of #2117.

24
00:02:18,000 --> 00:02:20,830
(sigh) I decreased the number of added samples.

25
00:02:21,630 --> 00:02:25,300
Could you test the detection results locally with the samples I gave you?

26
00:02:26,000 --> 00:02:28,670
- What is the command to run that test?
- Here...

27
00:02:28,716 --> 00:02:38,650
[Coding intensifies]

28
00:02:38,650 --> 00:02:43,330
It is getting hung up on a false detection of Frege in one of the Text samples.

29
00:02:43,540 --> 00:02:46,115
Do you have any suggestions for implementing a heuristic?

30
00:02:47,640 --> 00:02:55,200
#2441 should fix this. In the meantime, you can change this in "test_heuristics.rb"

31
00:02:55,165 --> 00:02:57,240
Why did you have to change this?

32
00:02:57,777 --> 00:03:04,480
- It doesn't work for me unless I do that.
- Hum, same for me. Arfon, does it work for you?

33
00:03:04,920 --> 00:03:08,830
Requiring linguist/language doesn't work for me either.

34
00:03:09,300 --> 00:03:13,885
We restructured some of the requires a while ago and I think this is just out-of-date code.

35
00:03:14,065 --> 00:03:20,950
From a large sample of known NCL files taken from Github, it's now predicting with about 98% accuracy.

36
00:03:21,183 --> 00:03:28,000
For a large sample of other files with the NCL extension, it is around 92%.

37
00:03:27,880 --> 00:03:30,950
From those, nearly all of the errors come from one GitHub repository,

38
00:03:30,950 --> 00:03:34,160
and they all contain the text strings, "The URL" and "The Title".

39
00:03:35,660 --> 00:03:43,260
- Do you mean 92% files correctly identified as text?
- Yes, it correctly identifies 92% as text.

40
00:03:44,000 --> 00:03:46,150
I'd really like to see this dramatically reduced.

41
00:03:46,150 --> 00:03:51,150
What happens if we reduce to around 5 NCL sample files?

42
00:03:51,150 --> 00:03:52,600
Does Linguist still do a reasonable job?

43
00:03:53,470 --> 00:03:58,190
I reduced it to 16 NCL samples and 8 text samples.

44
00:03:58,190 --> 00:04:01,720
It correctly classifies my whole set of known NCL files.

45
00:04:01,870 --> 00:04:05,730
I tried with 5 samples but could not get the same level of accuracy.

46
00:04:06,670 --> 00:04:10,400
It incorrectly classifies all of the NCL files in this GitHub repository.

47
00:04:11,130 --> 00:04:14,660
All of these files contain the text strings, "THE_URL:" and "THE_TITLE:".

48
00:04:14,660 --> 00:04:19,500
It did not misclassify any other text-files with the extension NCL.

49
00:04:19,970 --> 00:04:25,188
With 100% accuracy? Does that mean the results are better with fewer samples?

50
00:04:25,610 --> 00:04:31,190
I also removed a sample text-file which should have been classified as an NCL file.

51
00:04:31,000 --> 00:04:35,895
I think that probably made most of the difference, although I didn't test it atomically.

52
00:04:35,895 --> 00:04:38,370
Okay, that makes more sense.

53
00:04:39,515 --> 00:04:43,450
I don't get the same results for the text files. Full results here.

54
00:04:44,650 --> 00:04:50,000
They all look correctly classified to me, except for the ones in Fanghuan's repository.

55
00:04:50,000 --> 00:04:55,920
I manually went through all of the ones where I didn't already know based on the filename or the repository owner.

56
00:04:56,526 --> 00:05:00,000
[Presses button] It now correctly classifies all of my test files.

57
00:05:00,000 --> 00:05:05,970
R. Pavlick, thanks for this. These changes will be live in the next release of Linguist, in the next couple of weeks.

58
00:05:05,970 --> 00:05:07,450
Great! Thanks.

vendor/grammars/atom-language-srt (1 change, vendored, Submodule)
Submodule vendor/grammars/atom-language-srt added at 386b4b64ae

vendor/licenses/grammar/atom-language-srt.txt (25 changes, vendored, Normal file)
@@ -0,0 +1,25 @@
---
type: grammar
name: atom-language-srt
license: mit
---
Copyright (c) 2016 Pieter Goetschalckx

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.