; ; GT.M PCRE Extension ; Copyright (C) 2012 Piotr Koper ; ; This program is free software: you can redistribute it and/or modify ; it under the terms of the GNU Affero General Public License as ; published by the Free Software Foundation, either version 3 of the ; License, or (at your option) any later version. ; ; This program is distributed in the hope that it will be useful, ; but WITHOUT ANY WARRANTY; without even the implied warranty of ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ; GNU Affero General Public License for more details. ; ; You should have received a copy of the GNU Affero General Public License ; along with this program. If not, see . ; ; GT.M™ is a trademark of Fidelity Information Services, Inc. ; "GT.M™ is a vetted industrial strength, transaction processing application ; platform consisting of a key-value database engine optimized for extreme ; transaction processing throughput & business continuity." ; -- http://sourceforge.net/projects/fis-gtm/ ; GT.M PCRE Extension ; ; This extension tries to deliver the best possible PCRE interface for the ; M world by providing a support for PCRE with M arrays, stringified parameter ; names, simplified API, locales, exceptions and Perl5 Global Match. ; ; See pcreexamples.m for comprehensive examples on ^pcre routines usage and ; beginner level tips on match limits, exception handling and UTF-8 in GT.M. ; ; Try out the best known book on regular expressions: http://regex.info/ ; For more information on PCRE see: http://pcre.org/ ; ; Please feel free to contact me if you have any questions or comments, ; Piotr Koper ; pcre ;GT.M PCRE Extension ;1.0;Initial release;pkoper q version() q $&pcre.version() config(name) ; name is one of: (case insensitive) ; "UTF8", "NEWLINE", "LINK_SIZE", "POSIX_MALLOC_THRESHOLD", ; "MATCH_LIMIT", "MATCH_LIMIT_RECURSION", "STACKRECURSE", ; "BSR", "UNICODE_PROPERTIES", "JIT", "JITTARGET" ; d protect ; n erropt,isstring,s,n,code s code=$&pcre.config(.name,.erropt,.isstring,.s,.n) s:code $ec=",U"_(-code)_"," q $s(isstring:s,1:n) compile(pattern,options,locale,mlimit,reclimit) ; options is case insensitive and optional string with "|" joined: ; "ANCHORED", "CASELESS", "DOLLAR_ENDONLY", "DOTALL", "EXTENDED", ; "FIRSTLINE", "MULTILINE", "NO_AUTO_CAPTURE", "DUPNAMES", ; "UNGREEDY", "BSR_ANYCRLF", "BSR_UNICODE", "JAVASCRIPT_COMPAT", ; "NL_ANY", "NL_ANYCRLF", "NL_CR", "NL_CRLF","NL_LF", ; "UTF8", "UCP", "NO_UTF8_CHECK" ; ; locale is an optional Unix locale name used for pcre_maketables(), ; cases: ; undefined or "": ; pcre_maketables() will not be called ; "ENV" (case insensitive): ; use locale in program environment defined by the ; environment variables LANG or LC_* ; specified: ; "pl_PL.iso88592", "pl_PL.utf8", "C", ... ; see locale(1), locale(2) and the output of command: ; $ locale -a ; Debian tip: use ; $ dpkg-reconfigure locales ; to enable or set system-wide locale ; ; mlimit (optional) limits the number of internal matching function ; calls in pcre_exec() execution, see PCRE manual for details ; ; reclimit (optional) limit for the depth of recursion when calling ; the internal matching function in a pcre_exec() execution, ; see PCRE manual for details ; d protect ; n erropt,ref,err,erroffset,code s code=$&pcre.compile(.pattern,$g(options),.erropt,.ref,.err,.erroffset,$g(locale),$g(mlimit,0),$g(reclimit,0)) s:code $ec=",U"_(-code)_"," q ref exec(ref,subject,options,startoffset,length) ; options is case insensitive and optional string with "|" joined: ; "ANCHORED", "BSR_ANYCRLF", "BSR_UNICODE", ; "NL_ANY", "NL_ANYCRLF", "NL_CR", "NL_CRLF", "NL_LF", ; "NOTBOL", "NOTEOL", "NOTEMPTY", "NOTEMPTY_ATSTART", ; "NO_START_OPTIMIZE", "NO_UTF8_CHECK", ; "PARTIAL_SOFT", "PARTIAL_HARD" ; ; startoffset is in octets, starts with 1 (like in M) (optional) ; ; length is subject length in octets, not chars (optional) ; d protect ; n erropt,code,start s start=$g(startoffset,1)-1 s code=$&pcre.exec(.ref,.subject,$g(length,$zl(subject)),start,$g(options),.erropt) s:code<0 $ec=",U"_(-code)_"," q code ovector(ref,i) ; return i-element from ovector d protect ; n n,code s code=$&pcre.ovector(.ref,.i,.n) s:code $ec=",U"_(-code)_"," ;s $ec=",U123," q n ovecsize(ref) ; return ovecsize d protect ; n n,code s code=$&pcre.ovecsize(.ref,.n) s:code $ec=",U"_(-code)_"," q n fullinfo(ref,name) ; name is one of: (case insensitive) ; "OPTIONS", "SIZE", "CAPTURECOUNT", "BACKREFMAX", "FIRSTBYTE", ; "FIRSTTABLE", "LASTLITERAL", "NAMEENTRYSIZE", "NAMECOUNT", ; "STUDYSIZE", "OKPARTIAL", "JCHANGED", "HASCRORLF", "MINLENGTH", ; "JIT", "JITSIZE" ; for NAME* options see also $$nametable^pcre() ; d protect ; n erropt,isstring,s,n,code s code=$&pcre.fullinfo(.ref,.name,.erropt,.isstring,.s,.n) s:code $ec=",U"_(-code)_"," q $s(isstring:s,1:n) nametable(ref,i,n) ; returns index (n) and name, or { 0, "" } for invalid i ; i is indexed from 1 ; d protect ; n s,code s code=$&pcre.nametable(.ref,.i,.n,.s) s:code $ec=",U"_(-code)_"," q s substring(ref,i,begin,end) s begin=$$ovector(.ref,i*2)+1,end=$$ovector(.ref,i*2+1) ; ovector contains octet indexed data not UNICODE chars, so $ze is used q:'begin "" q $s($g(o,0):begin_","_end,1:$ze(subject,begin,end)) store(ref,i,n,o,key) ; same as above but stores captured data in n array n begin,end s begin=$$ovector(.ref,i*2)+1,end=$$ovector(.ref,i*2+1) q:'begin s key=$g(key,i) s:o n(key,0)=begin,n(key,1)=end s n(key)=$ze(subject,begin,end) q gstore(ref,i,n,round,byref,o,key) ; store for global match n begin,end s begin=$$ovector(.ref,i*2)+1,end=$$ovector(.ref,i*2+1) q:'begin s key=$g(key,i) i byref d . s:o n(key,round,0)=begin,n(key,round,1)=end . s n(key,round)=$ze(subject,begin,end) e d . s:o n(round,key,0)=begin,n(round,key,1)=end . s n(round,key)=$ze(subject,begin,end) q test(subject,pattern,options,locale,mlimit,reclimit) ; see $$compile^pcre for options, locale, mlimit and reclimit ; d protect n ref,l s ref=$$compile(.pattern,$g(options),$g(locale),$g(mlimit,0),$g(reclimit,0)) s l=$$exec(.ref,.subject) d free(.ref) q l match(subject,pattern,match,capture,options,locale,mlimit,reclimit) ; see $$compile^pcre for options, locale, mlimit and reclimit ; ; capture is case insensitive and optional string with "|" joined ; names or indexes to be capture ; ; extended options: ; "NAMED_ONLY" - capture only named groups ; "OVECTOR" - return additional ovector data ; d protect ; n namedonly,ovector,ref,o,l,i,j,s,c,begin ; s options=$g(options),(namedonly,ovector)=0 f i=1:1:$l(options,"|") d . s o=$zco($p(options,"|",i),"u") . i o="NAMED_ONLY" s namedonly=1,$p(options,"|",i)="" . i o="OVECTOR" s ovector=1,$p(options,"|",i)="" s:namedonly options=options_"|NO_AUTO_CAPTURE" ; k match s ref=$$compile(.pattern,.options,$g(locale),$g(mlimit,0),$g(reclimit,0)) s l=$$exec(.ref,.subject) i $d(capture) d . s c="|"_capture_"|" . ; ovector indexed data . i 'namedonly f i=0:1:l-1 d:c[("|"_i_"|") store(.ref,.i,.match,.ovector) . ; named matches data . f i=1:1 s s=$$nametable(.ref,.i,.j) q:s="" d:c[("|"_s_"|") store(.ref,.j,.match,.ovector,.s) e d . i 'namedonly f i=0:1:l-1 d store(.ref,.i,.match,.ovector) . f i=1:1 s s=$$nametable(.ref,.i,.j) q:s="" d store(.ref,.j,.match,.ovector,.s) d free(.ref) q:$q l q global(subject,pattern,match,capture,options,locale,mlimit,reclimit) ; options is the same as for match^pcre, extended options: ; "OVECTOR" - return additional ovector data ; "GROUPED" - group the result in match array by pattern groups ; "NAMED_ONLY" - capture only named patterns ; ; see pcredemo.c and pcreccp.cc from PCRE for comments on procedure ; for Perl like global matching ; d protect ; n ref,c,o,ovector,byref,namedonly,utf8,crlf,start,end,matches,empty,skip,round,i,j,s,n,q k match ; ; determine additional options and remove them before calling the compile^pcre s options=$g(options),(ovector,byref,namedonly)=0 f i=1:1:$l(options,"|") d . s o=$zco($p(options,"|",i),"u") . i o="NAMED_ONLY" s namedonly=1,$p(options,"|",i)="" . i o="GROUPED" s byref=1,$p(options,"|",i)="" . i o="OVECTOR" s ovector=1,$p(options,"|",i)="" s:namedonly options=options_"|NO_AUTO_CAPTURE" ; ; compile the pattern s ref=$$compile(.pattern,.options,$g(locale),$g(mlimit,0),$g(reclimit,0)) ; s:$d(capture) c="|"_capture_"|" s byref=$g(byref,0) ; ; check pattern options for UTF8 and double char new line s o="|"_$$fullinfo(.ref,"OPTIONS")_"|" s utf8=$s(o["|UTF8|":1,1:0) s crlf=$s(o["|NL_CRLF|":1,o["|NL_ANY|":1,o["|NL_ANYCRLF|":1,1:0) ; ; if none check the PCRE build options i crlf=0 d . s o=$$config("NEWLINE") . s crlf=$s(o="NL_CRLF":1,o="NL_ANY":1,o="NL_ANYCRLF":1,1:0) ; s (start,round,i)=1,(empty,skip,q)=0 s end=$l(subject)+1 f d q:start>end!q . i empty d .. s matches=$$exec(.ref,.subject,"NOTEMPTY_ATSTART|ANCHORED",.start) ; unwind this call to optimize .. q:matches ; quit this do, leave empty=1, store the matches .. .. ; advance if no match & clear empty .. s start=start+1 .. i start>end s q=1 q .. .. ; skip LF if CR was before and CRLF mode .. s:crlf&(($ze(subject,start-1)=$c(13))&($ze(subject,start)=$c(10))) start=start+1 .. .. ; skip if in a middle of UTF char .. i utf8 f q:start'(?:\d+|\\))|{(?[^}]+)})",.backref,,"ovector|dupnames") ; s options=$g(options)_"|ovector" ; silently remove "NAMED_ONLY" and "GROUPPED" options f i=1:1:$l(options,"|") d . s o=$zco($p(options,"|",i),"u") . s:o="NAMED_ONLY"!(o="GROUPED") $p(options,"|",i)="" q:'$$global(.subject,.pattern,.n,,.options,$g(locale),$g(mlimit,0),$g(reclimit,0)) subject ; ; perform the substitution on matched subject parts s first=$g(first,1),last=$g(last,0) s offset=0,i="" f s i=$o(n(i)) q:i="" d:i'0&(i'end $ze(subject,begin+offset,begin+offset+1)=s_$ze(subject,begin+offset,begin+offset+1) . . s offset=offset-(end+1-begin)+$l(s) q:$q subject q free(ref) d protect n code s code=$&pcre.free(.ref) s:code $ec=",U"_(-code)_"," q stackusage() ; return the approximate amount of stack (in bytes) used per ; recursion in pcre_exec() q -$&pcre.stackusage() ; Exception Handling ; ; Error conditions are handled by setting the $zc to user codes, see labels ; at the end of this file. When neither $zt nor $et are set by the user, ; the default handler (trap^pcre) is used within $zt mechanism. ; ; The default handler will write out the details of the exception, and ; depending on the caller type, it will re raise the exception. This will ; lead to: ; a) writing the exception details, when called from the GT.M prompt, ; b) writing the exception details, the M code place when the pcre routine ; was called, and terminating the GT.M image. ; ; The user should define own exception handler using $zt or $et, see ; pcreexample.m for example exception handlers. ; protect ; try setup $zt with default handler ; ; "n protect" in the $zt is a marker for trap^pcre s:'$l($et)&(($zt="B")!'$l($zt)) $zt="n protect d trap zg "_($zl-2) q trap(stack) ; see U* labels at the bottom of this file, some lvns are mandatory ; all exceptions are passed through if we wasn't called from direct mode ; n zl,ref,msg,place ; ; take the $zl if in default handler setup by protect^trap s zl=$p($zt,"n protect d trap zg ",2) ; ; clear the $zt s $zt="" ; ; source location from either stack argument, zl (default handler), or $st-2 s place=$st($g(stack,$g(zl,$st-1)-1),"PLACE") ; ; clear location if called from direct mode s:place["^GTM$DMOD" place="" ; s ref=$p($ec,",",$l($ec,",")-1) i $l($t(@ref)) d . u $p . w @$p($t(@ref),";",2) . ; %PCRE-E-COMPILE additional message . w:ref="U16392"&$g(erroffset) " in "_$e($g(pattern),1,erroffset)_" <-- HERE" . w ! . ; write the location it has any meaning . w:$l(place) "%PCRE-I-RTSLOC, At M source location ",place,! e d . w $p($zs,",",3,4),! . w "%GTM-I-RTSLOC, At M source location ",$p($zs,",",2),! ; ; re raise the exception if in a default handler and not called from the direct mode s:$l(place)&$g(zl,0) $ec=$ec q ; XC API specific ; U16384 ;"%PCRE-E-ARGSMALL, Actual argument count is too small" U16385 ;"%PCRE-E-OPTNAME, Unknown option name "_$p($g(erropt),"|") U16386 ;"%PCRE-E-OBJLIMIT, Maximum number of objects exceeded" U16387 ;"%PCRE-E-INVREF, Invalid object reference" U16388 ;"%PCRE-E-INTBUF, Internal buffer too small" U16389 ;"%PCRE-E-MALLOC, Could not allocate memory" U16390 ;"%PCRE-E-STUDY, Pattern study failed: "_$g(err,"unknown reason") U16391 ;"%PCRE-E-LOCALE, Invalid locale name "_$g(locale) U16392 ;"%PCRE-E-COMPILE, Pattern compilation failed, "_$g(err,"unknown reason") U16393 ;"%PCRE-E-LENGTH, Invalid length value specified" ; PCRE specific ; ; NOTES: ; ; U16401 exception is never raised; when pcre_exec() returns -1 ; (i.e. NOMATCH) the pcre.exec returns 0, so no exception will ; ever raise, NOMATCH is not an uncommon situation ; ; U16388 is raised when pcre_exec() returns 0, i.e. the ovector ; was too small; considering that ovector size is not controlled ; in M world, it is an exception here ; U16401 ;"%PCRE-E-NOMATCH, The subject string did not match the pattern" U16402 ;"%PCRE-E-NULL, Either compiled code or subject was passed as NULL, or ovector was NULL" U16403 ;"%PCRE-E-BADOPTION, An unrecognized bit was set in the options argument" U16404 ;"%PCRE-E-BADMAGIC, The magic number is not present in compiled code" U16405 ;"%PCRE-E-UNKNOWNOPCODE, While running the pattern match, an unknown item was encountered in the compiled pattern" U16406 ;"%PCRE-E-NOMEMORY, Call via pcre_malloc() or pcre_stack_malloc() failed" U16407 ;"%PCRE-E-NOSUBSTRING, No substring" U16408 ;"%PCRE-E-MATCHLIMIT, Match limit was reached" U16409 ;"%PCRE-E-CALLOUT, Callout function wanted to yield a distinctive error code" U16410 ;"%PCRE-E-BADUTF8, A string that contains an invalid UTF-8 byte sequence was passed as a subject" U16411 ;"%PCRE-E-BADUTF8OFFSET, The value of startoffset did not point to the beginning of a UTF-8 character or the end of the subject" U16412 ;"%PCRE-E-PARTIAL, The subject string did not match, but it did match partially" U16414 ;"%PCRE-E-INTERNAL, An unexpected internal error has occurred" U16415 ;"%PCRE-E-BADCOUNT, The value of the ovecsize argument is negative" U16416 ;"%PCRE-E-DFAUITEM, Unsupported item in the pattern, for e.g. \C o a back reference" U16417 ;"%PCRE-E-DFAUCOND, Unsupported condition item, for e.g. a back reference for a condition, or a test for recursion in a specific group" U16418 ;"%PCRE-E-DFAUMLIMIT, Match limits are unsupported for DTA matching" U16419 ;"%PCRE-E-DFAWSSIZE, Out of space in the workspace vector" U16420 ;"%PCRE-E-DFARECURSE, The output vector was not large enough while processing recursive subpattern" U16421 ;"%PCRE-E-RECURSIONLIMIT, The internal recursion limit was reached" U16423 ;"%PCRE-E-BADNEWLINE, An invalid combination of NL_xxx options was given" U16424 ;"%PCRE-E-BADOFFSET, The startoffset was negative or greater than the length of the value in length" U16425 ;"%PCRE-E-SHORTUTF8, The subject string ends with a truncated UTF-8 character and the PCRE_PARTIAL_HARD option is set" U16426 ;"%PCRE-E-RECURSELOOP, A recursion loop within the pattern was detected" U16427 ;"%PCRE-E-JITSTACKLIMIT, The memory available for the just-in-time processing stack is not large enough"