From 1bbcfa5683c203eb92a0d9571f63043791bea695 Mon Sep 17 00:00:00 2001 From: bruno cuconato Date: Wed, 21 Feb 2018 12:27:32 -0300 Subject: [PATCH] add CoNLL-U format (#4029) * * add CoNLL-U format - add to languages.yml - add textmate grammar - add to vendor/README - add to grammars.yml - add samples * rm other extensions as I couldn't find properly licensed examples of them in the wild * substitutesamples for something with appropriate license * update grammar submodule so it finds the LICENSE * add license to grammar * * conllu - readd other extensions - abridge samples and a new one - update grammar submodule: correct extension of grammar file * rm .conllx extension --- .gitmodules | 3 + grammars.yml | 2 + lib/linguist/languages.yml | 15 +- samples/CoNLL-U/CF1.conllu | 159 ++++++++++++++ samples/CoNLL-U/en-ud-test-abridged.conllu | 122 +++++++++++ samples/CoNLL-U/ug-ud-test-abridged.conllu | 121 ++++++++++ vendor/README.md | 1 + vendor/grammars/conllu-linguist-grammar | 1 + .../grammar/conllu-linguist-grammar.txt | 206 ++++++++++++++++++ 9 files changed, 628 insertions(+), 2 deletions(-) create mode 100644 samples/CoNLL-U/CF1.conllu create mode 100644 samples/CoNLL-U/en-ud-test-abridged.conllu create mode 100644 samples/CoNLL-U/ug-ud-test-abridged.conllu create mode 160000 vendor/grammars/conllu-linguist-grammar create mode 100644 vendor/licenses/grammar/conllu-linguist-grammar.txt diff --git a/.gitmodules b/.gitmodules index b7211d57..ceec2493 100644 --- a/.gitmodules +++ b/.gitmodules @@ -898,3 +898,6 @@ [submodule "vendor/grammars/atom-language-nextflow"] path = vendor/grammars/atom-language-nextflow url = https://github.com/nextflow-io/atom-language-nextflow +[submodule "vendor/grammars/conllu-linguist-grammar"] + path = vendor/grammars/conllu-linguist-grammar + url = https://github.com/odanoburu/conllu-linguist-grammar diff --git a/grammars.yml b/grammars.yml index 63c6005c..8502f043 100755 --- a/grammars.yml +++ b/grammars.yml @@ -248,6 +248,8 @@ vendor/grammars/chapel-tmbundle: vendor/grammars/cmake.tmbundle: - source.cache.cmake - source.cmake +vendor/grammars/conllu-linguist-grammar: +- text.conllu vendor/grammars/cool-tmbundle: - source.cool vendor/grammars/cpp-qt.tmbundle: diff --git a/lib/linguist/languages.yml b/lib/linguist/languages.yml index 0e4888be..2e73f348 100755 --- a/lib/linguist/languages.yml +++ b/lib/linguist/languages.yml @@ -730,6 +730,17 @@ Closure Templates: - ".soy" tm_scope: text.html.soy language_id: 357046146 +CoNLL-U: + type: data + extensions: + - ".conllu" + - ".conll" + tm_scope: text.conllu + ace_mode: text + aliases: + - CoNLL + - CoNLL-X + language_id: 421026389 CoffeeScript: type: programming tm_scope: source.coffee @@ -2907,7 +2918,7 @@ Nextflow: extensions: - ".nf" filenames: - - "nextflow.config" + - nextflow.config interpreters: - nextflow language_id: 506780613 @@ -4190,7 +4201,7 @@ Scala: color: "#c22d40" extensions: - ".scala" - - ".kojo" + - ".kojo" - ".sbt" - ".sc" interpreters: diff --git a/samples/CoNLL-U/CF1.conllu b/samples/CoNLL-U/CF1.conllu new file mode 100644 index 00000000..3d9f0f02 --- /dev/null +++ b/samples/CoNLL-U/CF1.conllu @@ -0,0 +1,159 @@ +# text = PT no governo +# source = CETENFolha n=1 cad=Opinião sec=opi sem=94a +# sent_id = CF1-1 +# id = 1 +1 PT PT PROPN PROP|M|S|@NPHR Gender=Masc|Number=Sing 0 root _ _ +2-3 no _ _ _ _ _ _ _ _ +2 em em ADP |PRP|@N< _ 4 case _ _ +3 o o DET <-sam>||ART|M|S|@>N Definite=Def|Gender=Masc|Number=Sing|PronType=Art 4 det _ _ +4 governo governo NOUN |N|M|S|@P< Gender=Masc|Number=Sing 1 nmod _ _ + +# text = BRASÍLIA Pesquisa Datafolha publicada hoje revela um dado supreendente: recusando uma postura radical, a esmagadora maioria (77%) dos eleitores quer o PT participando do Governo Fernando Henrique Cardoso. +# source = CETENFolha n=1 cad=Opinião sec=opi sem=94a &W +# sent_id = CF1-3 +# id = 2 +1 BRASÍLIA Brasília PROPN PROP|F|S|@ADVL> Gender=Fem|Number=Sing 6 dep _ _ +2 Pesquisa Pesquisa PROPN _ Gender=Fem|Number=Sing 6 nsubj _ ChangedBy=Issue119|MWE=Pesquisa_Datafolha|MWEPOS=PROPN +3 Datafolha Datafolha PROPN _ Number=Sing 2 flat:name _ ChangedBy=Issue119 +4 publicada publicar VERB |V|PCP|F|S|@ICL-N< Gender=Fem|Number=Sing|VerbForm=Part 2 acl _ _ +5 hoje hoje ADV ADV|@|V|PR|3S|IND|@FS-STA Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ +7 um um DET |ART|M|S|@>N Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 8 det _ _ +8 dado dado NOUN |N|M|S|@|V|GER|@ICL-ADVL> VerbForm=Ger 26 advcl _ _ +12 uma um DET |ART|F|S|@>N Definite=Ind|Gender=Fem|Number=Sing|PronType=Art 13 det _ _ +13 postura postura NOUN |N|F|S|@|ART|F|S|@>N Definite=Def|Gender=Fem|Number=Sing|PronType=Art 18 det _ _ +17 esmagadora esmagador ADJ ADJ|F|S|@>N Gender=Fem|Number=Sing 18 amod _ _ +18 maioria maioria NOUN |N|F|S|@SUBJ> Gender=Fem|Number=Sing 26 nsubj _ _ +19 ( ( PUNCT PU|@PU _ 21 punct _ ChangedBy=Issue165|SpaceAfter=No +20 77 77 NUM |NUM|M|P|@>N NumType=Card 21 nummod _ ChangedBy=Issue165|ChangedBy=Issue168|SpaceAfter=No +21 % % SYM |N|M|P|@N|PRP|@N< _ 25 case _ _ +24 os o DET <-sam>||ART|M|P|@>N Definite=Def|Gender=Masc|Number=Plur|PronType=Art 25 det _ _ +25 eleitores eleitor NOUN |N|M|P|@P< Gender=Masc|Number=Plur 18 nmod _ _ +26 quer querer VERB |V|PR|3S|IND|@FS-N|ART|M|S|@>N Definite=Def|Gender=Masc|Number=Sing|PronType=Art 28 det _ _ +28 PT PT PROPN PROP|M|S|@|V|GER|@ICL-|PRP|@||ART|M|S|@>N Definite=Def|Gender=Masc|Number=Sing|PronType=Art 32 det _ _ +32 Governo governo NOUN ||N|M|S|@P< Gender=Masc|Number=Sing 29 obl _ _ +33 Fernando Fernando PROPN _ Gender=Masc|Number=Sing 32 nmod _ ChangedBy=Issue119|MWE=Fernando_Henrique_Cardoso|MWEPOS=PROPN +34 Henrique Henrique PROPN _ Number=Sing 33 flat:name _ ChangedBy=Issue119 +35 Cardoso Cardoso PROPN _ Number=Sing 33 flat:name _ SpaceAfter=No +36 . . PUNCT PU|@PU _ 6 punct _ _ + +# text = Tem sentido -- aliás, muitíssimo sentido. +# source = CETENFolha n=1 cad=Opinião sec=opi sem=94a &D +# sent_id = CF1-4 +# id = 3 +1 Tem ter VERB |V|PR|3S|IND|@FS-STA Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ +2 sentido sentido NOUN |N|M|S|@|ADV|@||DET|M|S|@>N Gender=Masc|Number=Sing|PronType=Ind 7 det _ _ +7 sentido sentido NOUN |N|M|S|@N|ADV|@>A _ 2 advmod _ _ +2 mais mais ADV |||ADV|@ADVL> _ 22 advmod _ _ +3-4 do _ _ _ _ _ _ _ _ +3 de de ADP |PRP|@COM _ 8 case _ _ +4 o o PRON |<-sam>|DET|M|S|@P< Gender=Masc|Number=Sing|PronType=Dem 3 fixed _ _ +5 que que PRON |INDP|M|S|@N< Gender=Masc|Number=Sing|PronType=Rel 3 fixed _ _ +6-7 nos _ _ _ _ _ _ _ _ +6 em em ADP ||PRP|@KOMP< _ 8 case _ _ +7 os o DET <-sam>||ART|M|P|@>N Definite=Def|Gender=Masc|Number=Plur|PronType=Art 8 det _ _ +8 tempos tempo NOUN ||N|M|P|@P< Gender=Masc|Number=Plur 2 obl _ _ +9-10 na _ _ _ _ _ _ _ _ +9 em em ADP |PRP|@N< _ 11 case _ _ +10 a o DET <-sam>||ART|F|S|@>N Definite=Def|Gender=Fem|Number=Sing|PronType=Art 11 det _ _ +11 ditadura ditadura NOUN |N|F|S|@P< Gender=Fem|Number=Sing 8 nmod _ ChangedBy=Issue165|SpaceAfter=No +12 , , PUNCT PU|@PU _ 2 punct _ _ +13 a o DET |ART|F|S|@>N Definite=Def|Gender=Fem|Number=Sing|PronType=Art 14 det _ _ +14 solidez solidez NOUN |N|F|S|@SUBJ> Gender=Fem|Number=Sing 22 nsubj _ _ +15-16 do _ _ _ _ _ _ _ _ +15 de de ADP |PRP|@N< _ 17 case _ _ +16 o o DET <-sam>||ART|M|S|@>N Definite=Def|Gender=Masc|Number=Sing|PronType=Art 17 det _ _ +17 PT PT PROPN PROP|M|S|@P< Gender=Masc|Number=Sing 14 nmod _ _ +18 está estar AUX |V|PR|3S|IND|@FS-STA Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 22 cop _ ChangedBy=Issue165|ChangedBy=Issue167|SpaceAfter=No +19 , , PUNCT PU|@PU _ 20 punct _ _ +20 agora agora ADV |ADV|@|V|PCP|F|S|@ICL-|KC|@CO _ 2 cc _ _ +2 Lula Lula PROPN |PROP|M|S|@SUBJ> Gender=Masc|Number=Sing 7 nsubj _ _ +3 nem nem CCONJ ||KC|@CO _ 5 cc _ _ +4 o o DET |ART|M|S|@>N Definite=Def|Gender=Masc|Number=Sing|PronType=Art 5 det _ _ +5 partido partido NOUN ||N|M|S|@SUBJ> Gender=Masc|Number=Sing 2 conj _ _ +6 ainda ainda ADV ADV|@ADVL> _ 7 advmod _ _ +7 encontraram encontrar VERB |V|PS/MQP|3P|IND|@FS-STA Mood=Ind|Number=Plur|Person=3|VerbForm=Fin 0 root _ _ +8 um um DET _ Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 9 det _ _ +9 discurso discurso NOUN |N|M|S|@-PASS Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs 12 expl _ ChangedBy=Issue135 +12 diferenciar diferenciar VERB _ VerbForm=Inf 9 acl _ ChangedBy=Issue165|SpaceAfter=No +13 . . PUNCT PU|@PU _ 7 punct _ _ + +# text = Eles se dizem oposição, mas ainda não informaram o que vão combater. +# source = CETENFolha n=1 cad=Opinião sec=opi sem=94a +# sent_id = CF1-7 +# id = 6 +1 Eles eles PRON PERS|M|3P|NOM|@SUBJ> Case=Nom|Gender=Masc|Number=Plur|Person=3|PronType=Prs 3 nsubj _ _ +2 se se PRON PERS|M|3P|ACC|@ACC>-PASS Case=Acc|Gender=Masc|Number=Plur|Person=3|PronType=Prs 3 expl _ ChangedBy=Issue135 +3 dizem dizer VERB |||V|PR|3P|IND|@FS-STA Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ +4 oposição oposição NOUN |N|F|S|@|KC|@CO _ 9 cc _ _ +7 ainda ainda ADV ADV|@>A _ 8 advmod _ _ +8 não não ADV _ Polarity=Neg 9 advmod _ _ +9 informaram informar VERB ||V|PS/MQP|3P|IND|@FS-STA Mood=Ind|Number=Plur|Person=3|VerbForm=Fin 3 conj _ _ +10 o o PRON _ Gender=Masc|Number=Sing|PronType=Dem 11 det _ _ +11 que que PRON |INDP|M|S|@ACC> Gender=Masc|Number=Sing|PronType=Int 13 obj _ _ +12 vão ir AUX |V|PR|3P|IND|@FS-|V|INF|@ICL-AUX< VerbForm=Inf 9 ccomp _ ChangedBy=Issue165|SpaceAfter=No +14 . . PUNCT PU|@PU _ 3 punct _ _ + +# text = Muitas das prioridades do novo governo coincidem com as prioridades do PT. +# source = CETENFolha n=1 cad=Opinião sec=opi sem=94a +# sent_id = CF1-8 +# id = 7 +1 Muitas muito PRON |DET|F|P|@SUBJ> Gender=Fem|Number=Plur|PronType=Ind 9 nsubj _ _ +2-3 das _ _ _ _ _ _ _ _ +2 de de ADP |PRP|@N< _ 4 case _ _ +3 as o DET <-sam>||ART|F|P|@>N Definite=Def|Gender=Fem|Number=Plur|PronType=Art 4 det _ _ +4 prioridades prioridade NOUN |N|F|P|@P< Gender=Fem|Number=Plur 1 nmod _ _ +5-6 do _ _ _ _ _ _ _ _ +5 de de ADP |PRP|@N< _ 8 case _ _ +6 o o DET <-sam>||ART|M|S|@>N Definite=Def|Gender=Masc|Number=Sing|PronType=Art 8 det _ _ +7 novo novo ADJ ADJ|M|S|@>N Gender=Masc|Number=Sing 8 amod _ _ +8 governo governo NOUN |N|M|S|@P< Gender=Masc|Number=Sing 4 nmod _ _ +9 coincidem coincidir VERB |V|PR|3P|IND|@FS-STA Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ +10 com com ADP PRP|@|ART|F|P|@>N Definite=Def|Gender=Fem|Number=Plur|PronType=Art 12 det _ _ +12 prioridades prioridade NOUN |N|F|P|@P< Gender=Fem|Number=Plur 9 obj _ _ +13-14 do _ _ _ _ _ _ _ _ +13 de de ADP |PRP|@N< _ 15 case _ _ +14 o o DET <-sam>||ART|M|S|@>N Definite=Def|Gender=Masc|Number=Sing|PronType=Art 15 det _ _ +15 PT PT PROPN PROP|M|S|@P< Gender=Masc|Number=Sing 12 nmod _ ChangedBy=Issue165|SpaceAfter=No +16 . . PUNCT PU|@PU _ 9 punct _ _ + diff --git a/samples/CoNLL-U/en-ud-test-abridged.conllu b/samples/CoNLL-U/en-ud-test-abridged.conllu new file mode 100644 index 00000000..b07e4ac8 --- /dev/null +++ b/samples/CoNLL-U/en-ud-test-abridged.conllu @@ -0,0 +1,122 @@ +# newdoc id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200 +# sent_id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200-0001 +# text = What if Google Morphed Into GoogleOS? +1 What what PRON WP PronType=Int 0 root 0:root _ +2 if if SCONJ IN _ 4 mark 4:mark _ +3 Google Google PROPN NNP Number=Sing 4 nsubj 4:nsubj _ +4 Morphed morph VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 advcl 1:advcl _ +5 Into into ADP IN _ 6 case 6:case _ +6 GoogleOS GoogleOS PROPN NNP Number=Sing 4 obl 4:obl SpaceAfter=No +7 ? ? PUNCT . _ 4 punct 4:punct _ + +# sent_id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200-0002 +# text = What if Google expanded on its search-engine (and now e-mail) wares into a full-fledged operating system? +1 What what PRON WP PronType=Int 0 root 0:root _ +2 if if SCONJ IN _ 4 mark 4:mark _ +3 Google Google PROPN NNP Number=Sing 4 nsubj 4:nsubj _ +4 expanded expand VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 advcl 1:advcl _ +5 on on ADP IN _ 15 case 15:case _ +6 its its PRON PRP$ Gender=Neut|Number=Sing|Person=3|Poss=Yes|PronType=Prs 15 nmod:poss 15:nmod:poss _ +7 search search NOUN NN Number=Sing 9 compound 9:compound SpaceAfter=No +8 - - PUNCT HYPH _ 9 punct 9:punct SpaceAfter=No +9 engine engine NOUN NN Number=Sing 15 compound 15:compound _ +10 ( ( PUNCT -LRB- _ 9 punct 9:punct SpaceAfter=No +11 and and CCONJ CC _ 13 cc 13:cc _ +12 now now ADV RB _ 13 advmod 13:advmod _ +13 e-mail e-mail NOUN NN Number=Sing 9 conj 9:conj SpaceAfter=No +14 ) ) PUNCT -RRB- _ 15 punct 15:punct _ +15 wares wares NOUN NNS Number=Plur 4 obl 4:obl _ +16 into into ADP IN _ 22 case 22:case _ +17 a a DET DT Definite=Ind|PronType=Art 22 det 22:det _ +18 full full ADV RB _ 20 advmod 20:advmod SpaceAfter=No +19 - - PUNCT HYPH _ 20 punct 20:punct SpaceAfter=No +20 fledged fledged ADJ JJ Degree=Pos 22 amod 22:amod _ +21 operating operating NOUN NN Number=Sing 22 compound 22:compound _ +22 system system NOUN NN Number=Sing 4 obl 4:obl SpaceAfter=No +23 ? ? PUNCT . _ 4 punct 4:punct _ + +# sent_id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200-0003 +# text = [via Microsoft Watch from Mary Jo Foley ] +1 [ [ PUNCT -LRB- _ 4 punct 4:punct SpaceAfter=No +2 via via ADP IN _ 4 case 4:case _ +3 Microsoft Microsoft PROPN NNP Number=Sing 4 compound 4:compound _ +4 Watch Watch PROPN NNP Number=Sing 0 root 0:root _ +5 from from ADP IN _ 6 case 6:case _ +6 Mary Mary PROPN NNP Number=Sing 4 nmod 4:nmod _ +7 Jo Jo PROPN NNP Number=Sing 6 flat 6:flat _ +8 Foley Foley PROPN NNP Number=Sing 6 flat 6:flat _ +9 ] ] PUNCT -RRB- _ 4 punct 4:punct _ + +# newdoc id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700 +# sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0001 +# text = (And, by the way, is anybody else just a little nostalgic for the days when that was a good thing?) +1 ( ( PUNCT -LRB- _ 14 punct 14:punct SpaceAfter=No +2 And and CCONJ CC _ 14 cc 14:cc SpaceAfter=No +3 , , PUNCT , _ 14 punct 14:punct _ +4 by by ADP IN _ 6 case 6:case _ +5 the the DET DT Definite=Def|PronType=Art 6 det 6:det _ +6 way way NOUN NN Number=Sing 14 obl 14:obl SpaceAfter=No +7 , , PUNCT , _ 14 punct 14:punct _ +8 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 14 cop 14:cop _ +9 anybody anybody PRON NN Number=Sing 14 nsubj 14:nsubj _ +10 else else ADJ JJ Degree=Pos 9 amod 9:amod _ +11 just just ADV RB _ 13 advmod 13:advmod _ +12 a a DET DT Definite=Ind|PronType=Art 13 det 13:det _ +13 little little ADJ JJ Degree=Pos 14 obl:npmod 14:obl:npmod _ +14 nostalgic nostalgic NOUN NN Number=Sing 0 root 0:root _ +15 for for ADP IN _ 17 case 17:case _ +16 the the DET DT Definite=Def|PronType=Art 17 det 17:det _ +17 days day NOUN NNS Number=Plur 14 nmod 14:nmod _ +18 when when ADV WRB PronType=Rel 23 advmod 23:advmod _ +19 that that PRON DT Number=Sing|PronType=Dem 23 nsubj 23:nsubj _ +20 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 23 cop 23:cop _ +21 a a DET DT Definite=Ind|PronType=Art 23 det 23:det _ +22 good good ADJ JJ Degree=Pos 23 amod 23:amod _ +23 thing thing NOUN NN Number=Sing 17 acl:relcl 17:acl:relcl SpaceAfter=No +24 ? ? PUNCT . _ 14 punct 14:punct SpaceAfter=No +25 ) ) PUNCT -RRB- _ 14 punct 14:punct _ + +# sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0002 +# text = This BuzzMachine post argues that Google's rush toward ubiquity might backfire -- which we've all heard before, but it's particularly well-put in this post. +1 This this DET DT Number=Sing|PronType=Dem 3 det 3:det _ +2 BuzzMachine BuzzMachine PROPN NNP Number=Sing 3 compound 3:compound _ +3 post post NOUN NN Number=Sing 4 nsubj 4:nsubj _ +4 argues argue VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _ +5 that that SCONJ IN _ 12 mark 12:mark _ +6 Google Google PROPN NNP Number=Sing 8 nmod:poss 8:nmod:poss SpaceAfter=No +7 's 's PART POS _ 6 case 6:case _ +8 rush rush NOUN NN Number=Sing 12 nsubj 12:nsubj _ +9 toward toward ADP IN _ 10 case 10:case _ +10 ubiquity ubiquity NOUN NN Number=Sing 8 nmod 8:nmod _ +11 might might AUX MD VerbForm=Fin 12 aux 12:aux _ +12 backfire backfire VERB VB VerbForm=Inf 4 ccomp 4:ccomp _ +13 -- -- PUNCT , _ 12 punct 12:punct _ +14 which which PRON WDT PronType=Rel 18 obj 18:obj _ +15 we we PRON PRP Case=Nom|Number=Plur|Person=1|PronType=Prs 18 nsubj 18:nsubj SpaceAfter=No +16 've have AUX VBP Mood=Ind|Tense=Pres|VerbForm=Fin 18 aux 18:aux _ +17 all all ADV RB _ 18 advmod 18:advmod _ +18 heard hear VERB VBN Tense=Past|VerbForm=Part 12 acl:relcl 12:acl:relcl _ +19 before before ADV RB _ 18 advmod 18:advmod SpaceAfter=No +20 , , PUNCT , _ 27 punct 27:punct _ +21 but but CCONJ CC _ 27 cc 27:cc _ +22 it it PRON PRP Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs 27 nsubj:pass 27:nsubj:pass SpaceAfter=No +23 's be VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 27 aux:pass 27:aux:pass _ +24 particularly particularly ADV RB _ 27 advmod 27:advmod _ +25 well well ADV RB Degree=Pos 27 advmod 27:advmod SpaceAfter=No +26 - - PUNCT HYPH _ 27 punct 27:punct SpaceAfter=No +27 put put VERB VBN Tense=Past|VerbForm=Part 4 conj 4:conj _ +28 in in ADP IN _ 30 case 30:case _ +29 this this DET DT Number=Sing|PronType=Dem 30 det 30:det _ +30 post post NOUN NN Number=Sing 27 obl 27:obl SpaceAfter=No +31 . . PUNCT . _ 4 punct 4:punct _ + +# sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0003 +# text = Google is a nice search engine. +1 Google Google PROPN NNP Number=Sing 6 nsubj 6:nsubj _ +2 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 cop 6:cop _ +3 a a DET DT Definite=Ind|PronType=Art 6 det 6:det _ +4 nice nice ADJ JJ Degree=Pos 6 amod 6:amod _ +5 search search NOUN NN Number=Sing 6 compound 6:compound _ +6 engine engine NOUN NN Number=Sing 0 root 0:root SpaceAfter=No +7 . . PUNCT . _ 6 punct 6:punct _ + diff --git a/samples/CoNLL-U/ug-ud-test-abridged.conllu b/samples/CoNLL-U/ug-ud-test-abridged.conllu new file mode 100644 index 00000000..bf08ffb0 --- /dev/null +++ b/samples/CoNLL-U/ug-ud-test-abridged.conllu @@ -0,0 +1,121 @@ +# sent_id = s1 +# text = ئاسماننى كۆپكۈك، دەريا، كۆل سۇلىرىنى سۈپسۈزۈك تۇرۇشقا، دەل - دەرەخلەرنى بۈك - باراقسان بولۇشقا، ھايۋانلارنى ئەركىن ئازادە ياشاشقا ئىگە قىلىش... بىزنىڭ ئورتاق ئارزۇيىمىز. +1 ئاسماننى _ NOUN N _ 30 csubj _ Translit=asmanni +2 كۆپكۈك _ VERB V _ 1 orphan _ SpaceAfter=No|Translit=köpkük +3 ، _ PUNCT Y _ 2 punct _ Translit=, +4 دەريا _ NOUN N _ 7 nmod:poss _ SpaceAfter=No|Translit=derya +5 ، _ PUNCT Y _ 4 punct _ Translit=, +6 كۆل _ NOUN N _ 4 conj _ Translit=köl +7 سۇلىرىنى _ NOUN N _ 9 obj _ Translit=sulirini +8 سۈپسۈزۈك _ ADJ A _ 9 advmod _ Translit=süpsüzük +9 تۇرۇشقا _ VERB V _ 1 conj _ SpaceAfter=No|Translit=turushqa +10 ، _ PUNCT Y _ 1 punct _ Translit=, +11 دەل _ ADV D _ 13 compound:redup _ Translit=del +12 - _ PUNCT Y _ 11 punct _ Translit=- +13 دەرەخلەرنى _ NOUN N _ 17 obj _ Translit=derexlerni +14 بۈك _ ADJ A _ 16 compound _ Translit=bük +15 - _ PUNCT Y _ 14 punct _ Translit=- +16 باراقسان _ ADJ A _ 17 advmod _ Translit=baraqsan +17 بولۇشقا _ VERB V _ 9 orphan _ SpaceAfter=No|Translit=bolushqa +18 ، _ PUNCT Y _ 17 punct _ Translit=, +19 ھايۋانلارنى _ NOUN N _ 24 obj _ Translit=haywanlarni +20 ئەركىن _ ADJ A _ 21 compound:redup _ Translit=erkin +21 ئازادە _ ADJ A _ 22 advmod _ Translit=azade +22 ياشاشقا _ NOUN N _ 24 advcl _ Translit=yashashqa +23 ئىگە _ NOUN N _ 24 compound _ Translit=ige +24 قىلىش _ VERB V _ 1 conj _ SpaceAfter=No|Translit=qilish +25 . _ PUNCT Y _ 1 punct _ SpaceAfter=No|Translit=. +26 . _ PUNCT Y _ 1 punct _ SpaceAfter=No|Translit=. +27 . _ PUNCT Y _ 1 punct _ Translit=. +28 بىزنىڭ _ PRON P _ 30 nmod:poss _ Translit=bizning +29 ئورتاق _ ADJ A _ 30 amod _ Translit=ortaq +30 ئارزۇيىمىز _ NOUN N _ 0 root _ SpaceAfter=No|Translit=arzuyimiz +31 . _ PUNCT Y _ 30 punct _ Translit=. + +# sent_id = s2 +# text = بۇ بۆلەكتىكى تېكىستلەرنى ئوقۇش ئارقىلىق، كىشىلەرنىڭ ھايۋانلار ۋە ئۆسۈملۈكلەرگە قانداق مۇئامىلە قىلغانلىقى، ئاقىۋىتىنىڭ قانداق بولغانلىقىنى كۆرۈپ باقايلى، +1 بۇ _ PRON P _ 2 det _ Translit=bu +2 بۆلەكتىكى _ NOUN N _ 3 nmod _ Translit=bölektiki +3 تېكىستلەرنى _ NOUN N _ 4 obj _ Translit=tëkistlerni +4 ئوقۇش _ VERB V _ 18 advcl _ Translit=oqush +5 ئارقىلىق _ ADP R _ 4 case _ SpaceAfter=No|Translit=arqiliq +6 ، _ PUNCT Y _ 5 punct _ Translit=, +7 كىشىلەرنىڭ _ NOUN N _ 13 nsubj _ Translit=kishilerning +8 ھايۋانلار _ NOUN N _ 13 obl _ Translit=haywanlar +9 ۋە _ CCONJ C _ 10 cc _ Translit=we +10 ئۆسۈملۈكلەرگە _ NOUN N _ 8 conj _ Translit=ösümlüklerge +11 قانداق _ PRON P _ 13 advmod _ Translit=qandaq +12 مۇئامىلە _ NOUN N _ 13 compound _ Translit=muamile +13 قىلغانلىقى _ VERB V _ 18 conj _ SpaceAfter=No|Translit=qilghanliqi +14 ، _ PUNCT Y _ 13 punct _ Translit=, +15 ئاقىۋىتىنىڭ _ NOUN N _ 17 nsubj _ Translit=aqiwitining +16 قانداق _ PRON P _ 17 advmod _ Translit=qandaq +17 بولغانلىقىنى _ VERB V _ 18 obj _ Translit=bolghanliqini +18 كۆرۈپ _ VERB V _ 0 root _ Translit=körüp +19 باقايلى _ VERB V _ 18 aux _ SpaceAfter=No|Translit=baqayli +20 ، _ PUNCT Y _ 19 punct _ Translit=, + +# sent_id = s3 +# text = يەنە ئەتراپىمىزدىكى مۇھىتنى ياخشى كۆزىتىپ، مۇھىتنى قوغداش ئۈچۈن نېمىلەرنى قىلالايدىغانلىقىمىز توغرۇلۇق ئويلىنىپ باقايلى. +1 يەنە _ ADV D _ 13 cc _ Translit=yene +2 ئەتراپىمىزدىكى _ NOUN N _ 3 amod _ Translit=etrapimizdiki +3 مۇھىتنى _ NOUN N _ 5 obj _ Translit=muhitni +4 ياخشى _ ADJ A _ 5 advmod _ Translit=yaxshi +5 كۆزىتىپ _ VERB V _ 13 advcl _ SpaceAfter=No|Translit=közitip +6 ، _ PUNCT Y _ 5 punct _ Translit=, +7 مۇھىتنى _ NOUN N _ 8 obj _ Translit=muhitni +8 قوغداش _ VERB V _ 11 advcl _ Translit=qoghdash +9 ئۈچۈن _ CCONJ C _ 8 case _ Translit=üchün +10 نېمىلەرنى _ PRON P _ 11 obj _ Translit=nëmilerni +11 قىلالايدىغانلىقىمىز _ VERB V _ 13 obj _ Translit=qilalaydighanliqimiz +12 توغرۇلۇق _ ADP R _ 11 case _ Translit=toghruluq +13 ئويلىنىپ _ VERB V _ 0 root _ Translit=oylinip +14 باقايلى _ VERB V _ 13 aux _ SpaceAfter=No|Translit=baqayli +15 . _ PUNCT Y _ 14 punct _ Translit=. + +# sent_id = s4 +# text = بىر يىلى باھار كۈنلىرىنىڭ بىرىدە، شىۋېتسارىيىنىڭ بىر ۋوگزالىدا ھاۋا تەڭشىگۈچ ئورنىتىلغان چىرايلىق، ئازادە بىر پويىز قوزغىلىش ئالدىدا تۇراتتى. +1 بىر _ NUM M _ 2 nummod _ Translit=bir +2 يىلى _ NOUN N _ 20 nmod:tmod _ Translit=yili +3 باھار _ NOUN N _ 4 nmod:poss _ Translit=bahar +4 كۈنلىرىنىڭ _ NOUN N _ 5 nmod:part _ Translit=künlirining +5 بىرىدە _ NUM M _ 20 nmod:tmod _ SpaceAfter=No|Translit=biride +6 ، _ PUNCT Y _ 5 punct _ Translit=, +7 شىۋېتسارىيىنىڭ _ NOUN N _ 9 nmod:poss _ Translit=shiwëtsariyining +8 بىر _ NUM M _ 9 det _ Translit=bir +9 ۋوگزالىدا _ NOUN N _ 20 obl _ Translit=wogzalida +10 ھاۋا _ NOUN N _ 11 compound _ Translit=hawa +11 تەڭشىگۈچ _ NOUN N _ 12 nsubj _ Translit=tengshigüch +12 ئورنىتىلغان _ NOUN N _ 17 amod _ Translit=ornitilghan +13 چىرايلىق _ ADJ A _ 17 amod _ SpaceAfter=No|Translit=chirayliq +14 ، _ PUNCT Y _ 13 punct _ Translit=, +15 ئازادە _ ADJ A _ 13 conj _ Translit=azade +16 بىر _ NUM M _ 17 det _ Translit=bir +17 پويىز _ NOUN N _ 20 nsubj _ Translit=poyiz +18 قوزغىلىش _ VERB V _ 19 nmod:poss _ Translit=qozghilish +19 ئالدىدا _ NOUN N _ 20 obl _ Translit=aldida +20 تۇراتتى _ VERB V _ 0 root _ SpaceAfter=No|Translit=turatti +21 . _ PUNCT Y _ 20 punct _ Translit=. + +# sent_id = s5 +# text = ۋوگزال سۇپىسى ئۇزاتقۇچىلار بىلەن تولۇپ كەتكەنىدى. +1 ۋوگزال _ NOUN N _ 2 nmod:poss _ Translit=wogzal +2 سۇپىسى _ NOUN N _ 5 nsubj _ Translit=supisi +3 ئۇزاتقۇچىلار _ NOUN N _ 5 obl _ Translit=uzatquchilar +4 بىلەن _ ADP R _ 3 case _ Translit=bilen +5 تولۇپ _ VERB V _ 0 root _ Translit=tolup +6 كەتكەنىدى _ VERB V _ 5 aux _ SpaceAfter=No|Translit=ketkenidi +7 . _ PUNCT Y _ 6 punct _ Translit=. + +# sent_id = s6 +# text = ئۇلارنىڭ ئۇزاتماقچى بولغىنى ئۆزگىچە مىھمان - قارلىغاچلار ئىدى. +1 ئۇلارنىڭ _ PRON P _ 2 nsubj _ Translit=ularning +2 ئۇزاتماقچى _ NOUN N _ 5 acl _ Translit=uzatmaqchi +3 بولغىنى _ AUX V _ 2 cop _ Translit=bolghini +4 ئۆزگىچە _ ADJ A _ 5 amod _ Translit=özgiche +5 مىھمان _ NOUN N _ 7 appos _ Translit=mihman +6 - _ PUNCT Y _ 5 punct _ Translit=- +7 قارلىغاچلار _ NOUN N _ 0 root _ Translit=qarlighachlar +8 ئىدى _ AUX V _ 7 cop _ SpaceAfter=No|Translit=idi +9 . _ PUNCT Y _ 8 punct _ Translit=. + diff --git a/vendor/README.md b/vendor/README.md index b5b7401a..16797c5d 100644 --- a/vendor/README.md +++ b/vendor/README.md @@ -71,6 +71,7 @@ This is a list of grammars that Linguist selects to provide syntax highlighting - **Common Lisp:** [textmate/lisp.tmbundle](https://github.com/textmate/lisp.tmbundle) - **Common Workflow Language:** [manabuishii/language-cwl](https://github.com/manabuishii/language-cwl) - **Component Pascal:** [textmate/pascal.tmbundle](https://github.com/textmate/pascal.tmbundle) +- **CoNLL-U:** [odanoburu/conllu-linguist-grammar](https://github.com/odanoburu/conllu-linguist-grammar) - **Cool:** [anunayk/cool-tmbundle](https://github.com/anunayk/cool-tmbundle) - **Coq:** [mkolosick/Sublime-Coq](https://github.com/mkolosick/Sublime-Coq) - **Cpp-ObjDump:** [nanoant/assembly.tmbundle](https://github.com/nanoant/assembly.tmbundle) diff --git a/vendor/grammars/conllu-linguist-grammar b/vendor/grammars/conllu-linguist-grammar new file mode 160000 index 00000000..e9710323 --- /dev/null +++ b/vendor/grammars/conllu-linguist-grammar @@ -0,0 +1 @@ +Subproject commit e9710323ce3e41afb3723c936d65597ed8cb684e diff --git a/vendor/licenses/grammar/conllu-linguist-grammar.txt b/vendor/licenses/grammar/conllu-linguist-grammar.txt new file mode 100644 index 00000000..75db34d5 --- /dev/null +++ b/vendor/licenses/grammar/conllu-linguist-grammar.txt @@ -0,0 +1,206 @@ +--- +type: grammar +name: conllu-linguist-grammar +license: apache-2.0 +--- + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018 bruno cuconato + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.