Files
linguist/samples/M/pcre.m

512 lines
18 KiB
Mathematica

;
; GT.M PCRE Extension
; Copyright (C) 2012 Piotr Koper <piotr.koper@gmail.com>
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU Affero General Public License as
; published by the Free Software Foundation, either version 3 of the
; License, or (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU Affero General Public License for more details.
;
; You should have received a copy of the GNU Affero General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
;
; GT.M is a trademark of Fidelity Information Services, Inc.
; "GT.M is a vetted industrial strength, transaction processing application
; platform consisting of a key-value database engine optimized for extreme
; transaction processing throughput & business continuity."
; -- http://sourceforge.net/projects/fis-gtm/
; GT.M PCRE Extension
;
; This extension tries to deliver the best possible PCRE interface for the
; M world by providing a support for PCRE with M arrays, stringified parameter
; names, simplified API, locales, exceptions and Perl5 Global Match.
;
; See pcreexamples.m for comprehensive examples on ^pcre routines usage and
; beginner level tips on match limits, exception handling and UTF-8 in GT.M.
;
; Try out the best known book on regular expressions: http://regex.info/
; For more information on PCRE see: http://pcre.org/
;
; Please feel free to contact me if you have any questions or comments,
; Piotr Koper <piotr.koper@gmail.com>
;
pcre ;GT.M PCRE Extension
;1.0;Initial release;pkoper
q
version()
q $&pcre.version()
config(name)
; name is one of: (case insensitive)
; "UTF8", "NEWLINE", "LINK_SIZE", "POSIX_MALLOC_THRESHOLD",
; "MATCH_LIMIT", "MATCH_LIMIT_RECURSION", "STACKRECURSE",
; "BSR", "UNICODE_PROPERTIES", "JIT", "JITTARGET"
;
d protect
;
n erropt,isstring,s,n,code
s code=$&pcre.config(.name,.erropt,.isstring,.s,.n)
s:code $ec=",U"_(-code)_","
q $s(isstring:s,1:n)
compile(pattern,options,locale,mlimit,reclimit)
; options is case insensitive and optional string with "|" joined:
; "ANCHORED", "CASELESS", "DOLLAR_ENDONLY", "DOTALL", "EXTENDED",
; "FIRSTLINE", "MULTILINE", "NO_AUTO_CAPTURE", "DUPNAMES",
; "UNGREEDY", "BSR_ANYCRLF", "BSR_UNICODE", "JAVASCRIPT_COMPAT",
; "NL_ANY", "NL_ANYCRLF", "NL_CR", "NL_CRLF","NL_LF",
; "UTF8", "UCP", "NO_UTF8_CHECK"
;
; locale is an optional Unix locale name used for pcre_maketables(),
; cases:
; undefined or "":
; pcre_maketables() will not be called
; "ENV" (case insensitive):
; use locale in program environment defined by the
; environment variables LANG or LC_*
; specified:
; "pl_PL.iso88592", "pl_PL.utf8", "C", ...
; see locale(1), locale(2) and the output of command:
; $ locale -a
; Debian tip: use
; $ dpkg-reconfigure locales
; to enable or set system-wide locale
;
; mlimit (optional) limits the number of internal matching function
; calls in pcre_exec() execution, see PCRE manual for details
;
; reclimit (optional) limit for the depth of recursion when calling
; the internal matching function in a pcre_exec() execution,
; see PCRE manual for details
;
d protect
;
n erropt,ref,err,erroffset,code
s code=$&pcre.compile(.pattern,$g(options),.erropt,.ref,.err,.erroffset,$g(locale),$g(mlimit,0),$g(reclimit,0))
s:code $ec=",U"_(-code)_","
q ref
exec(ref,subject,options,startoffset,length)
; options is case insensitive and optional string with "|" joined:
; "ANCHORED", "BSR_ANYCRLF", "BSR_UNICODE",
; "NL_ANY", "NL_ANYCRLF", "NL_CR", "NL_CRLF", "NL_LF",
; "NOTBOL", "NOTEOL", "NOTEMPTY", "NOTEMPTY_ATSTART",
; "NO_START_OPTIMIZE", "NO_UTF8_CHECK",
; "PARTIAL_SOFT", "PARTIAL_HARD"
;
; startoffset is in octets, starts with 1 (like in M) (optional)
;
; length is subject length in octets, not chars (optional)
;
d protect
;
n erropt,code,start
s start=$g(startoffset,1)-1
s code=$&pcre.exec(.ref,.subject,$g(length,$zl(subject)),start,$g(options),.erropt)
s:code<0 $ec=",U"_(-code)_","
q code
ovector(ref,i) ; return i-element from ovector
d protect
;
n n,code
s code=$&pcre.ovector(.ref,.i,.n)
s:code $ec=",U"_(-code)_","
;s $ec=",U123,"
q n
ovecsize(ref) ; return ovecsize
d protect
;
n n,code
s code=$&pcre.ovecsize(.ref,.n)
s:code $ec=",U"_(-code)_","
q n
fullinfo(ref,name)
; name is one of: (case insensitive)
; "OPTIONS", "SIZE", "CAPTURECOUNT", "BACKREFMAX", "FIRSTBYTE",
; "FIRSTTABLE", "LASTLITERAL", "NAMEENTRYSIZE", "NAMECOUNT",
; "STUDYSIZE", "OKPARTIAL", "JCHANGED", "HASCRORLF", "MINLENGTH",
; "JIT", "JITSIZE"
; for NAME* options see also $$nametable^pcre()
;
d protect
;
n erropt,isstring,s,n,code
s code=$&pcre.fullinfo(.ref,.name,.erropt,.isstring,.s,.n)
s:code $ec=",U"_(-code)_","
q $s(isstring:s,1:n)
nametable(ref,i,n) ; returns index (n) and name, or { 0, "" } for invalid i
; i is indexed from 1
;
d protect
;
n s,code
s code=$&pcre.nametable(.ref,.i,.n,.s)
s:code $ec=",U"_(-code)_","
q s
substring(ref,i,begin,end)
s begin=$$ovector(.ref,i*2)+1,end=$$ovector(.ref,i*2+1)
; ovector contains octet indexed data not UNICODE chars, so $ze is used
q:'begin ""
q $s($g(o,0):begin_","_end,1:$ze(subject,begin,end))
store(ref,i,n,o,key) ; same as above but stores captured data in n array
n begin,end
s begin=$$ovector(.ref,i*2)+1,end=$$ovector(.ref,i*2+1)
q:'begin
s key=$g(key,i)
s:o n(key,0)=begin,n(key,1)=end
s n(key)=$ze(subject,begin,end)
q
gstore(ref,i,n,round,byref,o,key) ; store for global match
n begin,end
s begin=$$ovector(.ref,i*2)+1,end=$$ovector(.ref,i*2+1)
q:'begin
s key=$g(key,i)
i byref d
. s:o n(key,round,0)=begin,n(key,round,1)=end
. s n(key,round)=$ze(subject,begin,end)
e d
. s:o n(round,key,0)=begin,n(round,key,1)=end
. s n(round,key)=$ze(subject,begin,end)
q
test(subject,pattern,options,locale,mlimit,reclimit)
; see $$compile^pcre for options, locale, mlimit and reclimit
;
d protect
n ref,l
s ref=$$compile(.pattern,$g(options),$g(locale),$g(mlimit,0),$g(reclimit,0))
s l=$$exec(.ref,.subject)
d free(.ref)
q l
match(subject,pattern,match,capture,options,locale,mlimit,reclimit)
; see $$compile^pcre for options, locale, mlimit and reclimit
;
; capture is case insensitive and optional string with "|" joined
; names or indexes to be capture
;
; extended options:
; "NAMED_ONLY" - capture only named groups
; "OVECTOR" - return additional ovector data
;
d protect
;
n namedonly,ovector,ref,o,l,i,j,s,c,begin
;
s options=$g(options),(namedonly,ovector)=0
f i=1:1:$l(options,"|") d
. s o=$zco($p(options,"|",i),"u")
. i o="NAMED_ONLY" s namedonly=1,$p(options,"|",i)=""
. i o="OVECTOR" s ovector=1,$p(options,"|",i)=""
s:namedonly options=options_"|NO_AUTO_CAPTURE"
;
k match
s ref=$$compile(.pattern,.options,$g(locale),$g(mlimit,0),$g(reclimit,0))
s l=$$exec(.ref,.subject)
i $d(capture) d
. s c="|"_capture_"|"
. ; ovector indexed data
. i 'namedonly f i=0:1:l-1 d:c[("|"_i_"|") store(.ref,.i,.match,.ovector)
. ; named matches data
. f i=1:1 s s=$$nametable(.ref,.i,.j) q:s="" d:c[("|"_s_"|") store(.ref,.j,.match,.ovector,.s)
e d
. i 'namedonly f i=0:1:l-1 d store(.ref,.i,.match,.ovector)
. f i=1:1 s s=$$nametable(.ref,.i,.j) q:s="" d store(.ref,.j,.match,.ovector,.s)
d free(.ref)
q:$q l q
global(subject,pattern,match,capture,options,locale,mlimit,reclimit)
; options is the same as for match^pcre, extended options:
; "OVECTOR" - return additional ovector data
; "GROUPED" - group the result in match array by pattern groups
; "NAMED_ONLY" - capture only named patterns
;
; see pcredemo.c and pcreccp.cc from PCRE for comments on procedure
; for Perl like global matching
;
d protect
;
n ref,c,o,ovector,byref,namedonly,utf8,crlf,start,end,matches,empty,skip,round,i,j,s,n,q
k match
;
; determine additional options and remove them before calling the compile^pcre
s options=$g(options),(ovector,byref,namedonly)=0
f i=1:1:$l(options,"|") d
. s o=$zco($p(options,"|",i),"u")
. i o="NAMED_ONLY" s namedonly=1,$p(options,"|",i)=""
. i o="GROUPED" s byref=1,$p(options,"|",i)=""
. i o="OVECTOR" s ovector=1,$p(options,"|",i)=""
s:namedonly options=options_"|NO_AUTO_CAPTURE"
;
; compile the pattern
s ref=$$compile(.pattern,.options,$g(locale),$g(mlimit,0),$g(reclimit,0))
;
s:$d(capture) c="|"_capture_"|"
s byref=$g(byref,0)
;
; check pattern options for UTF8 and double char new line
s o="|"_$$fullinfo(.ref,"OPTIONS")_"|"
s utf8=$s(o["|UTF8|":1,1:0)
s crlf=$s(o["|NL_CRLF|":1,o["|NL_ANY|":1,o["|NL_ANYCRLF|":1,1:0)
;
; if none check the PCRE build options
i crlf=0 d
. s o=$$config("NEWLINE")
. s crlf=$s(o="NL_CRLF":1,o="NL_ANY":1,o="NL_ANYCRLF":1,1:0)
;
s (start,round,i)=1,(empty,skip,q)=0
s end=$l(subject)+1
f d q:start>end!q
. i empty d
.. s matches=$$exec(.ref,.subject,"NOTEMPTY_ATSTART|ANCHORED",.start) ; unwind this call to optimize
.. q:matches ; quit this do, leave empty=1, store the matches
..
.. ; advance if no match & clear empty
.. s start=start+1
.. i start>end s q=1 q
..
.. ; skip LF if CR was before and CRLF mode
.. s:crlf&(($ze(subject,start-1)=$c(13))&($ze(subject,start)=$c(10))) start=start+1
..
.. ; skip if in a middle of UTF char
.. i utf8 f q:start'<end!($zbitand($c(0)_$ze(subject,start),$c(0)_$c(192))=$c(0)_$c(128)) s start=start+1
..
.. ; take into account skipped chars
.. s skip=1,empty=0
. e d
.. s matches=$$exec(.ref,.subject,,.start)
.. i 'matches s q=1 q
.
. q:q
. i skip s skip=0 q
.
. i $d(c) d
.. ; ovector indexed data
.. i 'namedonly f i=0:1:matches-1 d:c[("|"_i_"|") gstore(.ref,.i,.match,.round,.byref,.ovector)
.. ; named matches data
.. f i=1:1 s s=$$nametable(.ref,.i,.n) q:s="" d:c[("|"_s_"|") gstore(.ref,.n,.match,.round,.byref,.ovector,.s)
. e d
.. i 'namedonly f i=0:1:matches-1 d gstore(.ref,.i,.match,.round,.byref,.ovector)
.. f i=1:1 s s=$$nametable(.ref,.i,.n) q:s="" d gstore(.ref,.n,.match,.round,.byref,.ovector,.s)
. s round=round+1
.
. s start=$$ovector(.ref,1)+1
. s empty=(($$ovector(.ref,0)+1)=start)
d free(.ref)
q:$q round-1 q
replace(subject,pattern,subst,first,last,options,locale,mlimit,reclimit)
; see $$match^pcre and $$compile^pcre for options, locale, mlimit and
; reclimit
;
; subst is a string to replace with all occurrences of matched data
; \n (like \1, \2, ..) is a back ref for the n-th captured group
; \{name} is back ref for a named captured data
; \\ is replaced with \
;
; first is the n-th match in the subject where the substitution begins,
; 1 .. n-1 matches are not substituted
; defaults to 1
;
; last is the n-th match in the subject where the substitution ends,
; n+1 .. matches are not substituted
; defaults to 0 (no limit)
;
n ref,o,n,i,j,begin,end,offset,backref,boffset,value,s
s ref=$$compile(.pattern,,$g(locale),$g(mlimit,0),$g(reclimit,0))
;
; prepare back reference stack
d global^pcre(.subst,"\\(?:(?<ref>(?:\d+|\\))|{(?<ref>[^}]+)})",.backref,,"ovector|dupnames")
;
s options=$g(options)_"|ovector"
; silently remove "NAMED_ONLY" and "GROUPPED" options
f i=1:1:$l(options,"|") d
. s o=$zco($p(options,"|",i),"u")
. s:o="NAMED_ONLY"!(o="GROUPED") $p(options,"|",i)=""
q:'$$global(.subject,.pattern,.n,,.options,$g(locale),$g(mlimit,0),$g(reclimit,0)) subject
;
; perform the substitution on matched subject parts
s first=$g(first,1),last=$g(last,0)
s offset=0,i=""
f s i=$o(n(i)) q:i="" d:i'<first q:last>0&(i'<last)
.
. ; replace back refs in subst (s) with captured data
. s s=subst,boffset=0,j=""
. f s j=$o(backref(j)) q:j="" d
..
.. ; determine the back ref type and get the value
.. ; silently ignore invalid refs
.. s value=$s(backref(j,"ref")="\":"\\",1:$g(n(i,backref(j,"ref"))))
..
.. ; replace back ref with the value
.. s begin=backref(j,0,0)
.. s end=backref(j,0,1)
.. s $ze(s,begin+boffset,end+boffset)=value
.. s boffset=boffset-(end+1-begin)+$l(value)
.
. ; replace matched data with prepared s
. s begin=n(i,0,0)
. s end=n(i,0,1)
.
. s $ze(subject,begin+offset,end+offset)=s
.
. ; substitute empty matches also (Perl style)
. ;
. ; perl -e '$_ = "aa"; s/(b*|a)/Xy/g; print "$_\n"'
. ; w $$replace^pcre("aa","(b*|a)","Xy")
. ;
. ; perl -e '$_ = "aa"; s/(b*|aa)/Xy/g; print "$_\n"'
. ; w $$replace^pcre("aa","(b*|aa)","Xy")
. ;
. ; perl -e '$_ = "aaa"; s/(b*|aa)/Xy/g; print "$_\n"'
. ; w $$replace^pcre("aaa","(b*|aa)","Xy")
. ;
. s:begin>end $ze(subject,begin+offset,begin+offset+1)=s_$ze(subject,begin+offset,begin+offset+1)
.
. s offset=offset-(end+1-begin)+$l(s)
q:$q subject q
free(ref)
d protect
n code
s code=$&pcre.free(.ref)
s:code $ec=",U"_(-code)_","
q
stackusage()
; return the approximate amount of stack (in bytes) used per
; recursion in pcre_exec()
q -$&pcre.stackusage()
; Exception Handling
;
; Error conditions are handled by setting the $zc to user codes, see labels
; at the end of this file. When neither $zt nor $et are set by the user,
; the default handler (trap^pcre) is used within $zt mechanism.
;
; The default handler will write out the details of the exception, and
; depending on the caller type, it will re raise the exception. This will
; lead to:
; a) writing the exception details, when called from the GT.M prompt,
; b) writing the exception details, the M code place when the pcre routine
; was called, and terminating the GT.M image.
;
; The user should define own exception handler using $zt or $et, see
; pcreexample.m for example exception handlers.
;
protect ; try setup $zt with default handler
;
; "n protect" in the $zt is a marker for trap^pcre
s:'$l($et)&(($zt="B")!'$l($zt)) $zt="n protect d trap zg "_($zl-2)
q
trap(stack)
; see U* labels at the bottom of this file, some lvns are mandatory
; all exceptions are passed through if we wasn't called from direct mode
;
n zl,ref,msg,place
;
; take the $zl if in default handler setup by protect^trap
s zl=$p($zt,"n protect d trap zg ",2)
;
; clear the $zt
s $zt=""
;
; source location from either stack argument, zl (default handler), or $st-2
s place=$st($g(stack,$g(zl,$st-1)-1),"PLACE")
;
; clear location if called from direct mode
s:place["^GTM$DMOD" place=""
;
s ref=$p($ec,",",$l($ec,",")-1)
i $l($t(@ref)) d
. u $p
. w @$p($t(@ref),";",2)
. ; %PCRE-E-COMPILE additional message
. w:ref="U16392"&$g(erroffset) " in "_$e($g(pattern),1,erroffset)_" <-- HERE"
. w !
. ; write the location it has any meaning
. w:$l(place) "%PCRE-I-RTSLOC, At M source location ",place,!
e d
. w $p($zs,",",3,4),!
. w "%GTM-I-RTSLOC, At M source location ",$p($zs,",",2),!
;
; re raise the exception if in a default handler and not called from the direct mode
s:$l(place)&$g(zl,0) $ec=$ec
q
; XC API specific
;
U16384 ;"%PCRE-E-ARGSMALL, Actual argument count is too small"
U16385 ;"%PCRE-E-OPTNAME, Unknown option name "_$p($g(erropt),"|")
U16386 ;"%PCRE-E-OBJLIMIT, Maximum number of objects exceeded"
U16387 ;"%PCRE-E-INVREF, Invalid object reference"
U16388 ;"%PCRE-E-INTBUF, Internal buffer too small"
U16389 ;"%PCRE-E-MALLOC, Could not allocate memory"
U16390 ;"%PCRE-E-STUDY, Pattern study failed: "_$g(err,"unknown reason")
U16391 ;"%PCRE-E-LOCALE, Invalid locale name "_$g(locale)
U16392 ;"%PCRE-E-COMPILE, Pattern compilation failed, "_$g(err,"unknown reason")
U16393 ;"%PCRE-E-LENGTH, Invalid length value specified"
; PCRE specific
;
; NOTES:
;
; U16401 exception is never raised; when pcre_exec() returns -1
; (i.e. NOMATCH) the pcre.exec returns 0, so no exception will
; ever raise, NOMATCH is not an uncommon situation
;
; U16388 is raised when pcre_exec() returns 0, i.e. the ovector
; was too small; considering that ovector size is not controlled
; in M world, it is an exception here
;
U16401 ;"%PCRE-E-NOMATCH, The subject string did not match the pattern"
U16402 ;"%PCRE-E-NULL, Either compiled code or subject was passed as NULL, or ovector was NULL"
U16403 ;"%PCRE-E-BADOPTION, An unrecognized bit was set in the options argument"
U16404 ;"%PCRE-E-BADMAGIC, The magic number is not present in compiled code"
U16405 ;"%PCRE-E-UNKNOWNOPCODE, While running the pattern match, an unknown item was encountered in the compiled pattern"
U16406 ;"%PCRE-E-NOMEMORY, Call via pcre_malloc() or pcre_stack_malloc() failed"
U16407 ;"%PCRE-E-NOSUBSTRING, No substring"
U16408 ;"%PCRE-E-MATCHLIMIT, Match limit was reached"
U16409 ;"%PCRE-E-CALLOUT, Callout function wanted to yield a distinctive error code"
U16410 ;"%PCRE-E-BADUTF8, A string that contains an invalid UTF-8 byte sequence was passed as a subject"
U16411 ;"%PCRE-E-BADUTF8OFFSET, The value of startoffset did not point to the beginning of a UTF-8 character or the end of the subject"
U16412 ;"%PCRE-E-PARTIAL, The subject string did not match, but it did match partially"
U16414 ;"%PCRE-E-INTERNAL, An unexpected internal error has occurred"
U16415 ;"%PCRE-E-BADCOUNT, The value of the ovecsize argument is negative"
U16416 ;"%PCRE-E-DFAUITEM, Unsupported item in the pattern, for e.g. \C o a back reference"
U16417 ;"%PCRE-E-DFAUCOND, Unsupported condition item, for e.g. a back reference for a condition, or a test for recursion in a specific group"
U16418 ;"%PCRE-E-DFAUMLIMIT, Match limits are unsupported for DTA matching"
U16419 ;"%PCRE-E-DFAWSSIZE, Out of space in the workspace vector"
U16420 ;"%PCRE-E-DFARECURSE, The output vector was not large enough while processing recursive subpattern"
U16421 ;"%PCRE-E-RECURSIONLIMIT, The internal recursion limit was reached"
U16423 ;"%PCRE-E-BADNEWLINE, An invalid combination of NL_xxx options was given"
U16424 ;"%PCRE-E-BADOFFSET, The startoffset was negative or greater than the length of the value in length"
U16425 ;"%PCRE-E-SHORTUTF8, The subject string ends with a truncated UTF-8 character and the PCRE_PARTIAL_HARD option is set"
U16426 ;"%PCRE-E-RECURSELOOP, A recursion loop within the pattern was detected"
U16427 ;"%PCRE-E-JITSTACKLIMIT, The memory available for the just-in-time processing stack is not large enough"