regex.h

Go to the documentation of this file.
00001 /*
00002 **********************************************************************
00003 *   Copyright (C) 2002-2008, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 **********************************************************************
00006 *   file name:  regex.h
00007 *   encoding:   US-ASCII
00008 *   indentation:4
00009 *
00010 *   created on: 2002oct22
00011 *   created by: Andy Heninger
00012 *
00013 *   ICU Regular Expressions, API for C++
00014 */
00015 
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018 
00019 //#define REGEX_DEBUG
00020 
00045 #include "unicode/utypes.h"
00046 
00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00048 
00049 #include "unicode/uobject.h"
00050 #include "unicode/unistr.h"
00051 #include "unicode/parseerr.h"
00052 
00053 #include "unicode/uregex.h"
00054 
00055 U_NAMESPACE_BEGIN
00056 
00057 
00058 // Forward Declarations...
          //   RegexMatcher and RegexPattern are defined later in this header.
          //   The remaining types are only used here through pointers or friend
          //   declarations, so incomplete declarations are sufficient.
00059 
00060 class RegexMatcher;              // defined below; returned by RegexPattern::matcher()
00061 class RegexPattern;              // defined below
00062 class UVector;                   // type of RegexPattern::fSets
00063 class UVector32;                 // type of fCompiledPat, fGroupMap and RegexMatcher::fStack
00064 class UnicodeSet;                // type of fStaticSets and fInitialChars
00065 struct REStackFrame;             // backtrack stack frame, used by RegexMatcher
00066 struct Regex8BitSet;             // type of fSets8, fStaticSets8 and fInitialChars8
00067 class  RuleBasedBreakIterator;   // type of RegexMatcher::fWordBreakItr
00068 class  RegexCImpl;               // declared a friend of both classes below
00069 
00070 
00071 
00072 
// RegexPatternDump(pat): debugging hook that takes a pointer to a RegexPattern.
//   - With REGEX_DEBUG defined it is declared as a real function (the
//     definition is not in this header).
//   - Otherwise it is a function-like macro that expands to nothing, so calls
//     disappear at preprocessing time and the argument expression is never
//     evaluated.
00077 #ifdef REGEX_DEBUG
00078 U_INTERNAL void U_EXPORT2
00079     RegexPatternDump(const RegexPattern *pat);
00080 #else
00081     #define RegexPatternDump(pat)
00082 #endif
00083 
00084 
00085 
// RegexPattern holds a regular expression pattern string together with the
// data produced by compiling it (see "Implementation Data" below).  The static
// compile() functions return a pointer to a RegexPattern, and the matcher()
// functions return a pointer to a RegexMatcher for the pattern.
//
// NOTE(review): ownership of the pointers returned by compile(), clone() and
// matcher() is not visible in this header; the ICU API documentation should be
// consulted (presumably the caller owns and must delete them).
00097 class U_I18N_API RegexPattern: public UObject {
00098 public:
00099 
          // Default constructor.
00107     RegexPattern();
00108 
          // Copy constructor.
00115     RegexPattern(const RegexPattern &source);
00116 
          // Destructor (virtual).
00122     virtual ~RegexPattern();
00123 
          // Equality comparison with another RegexPattern.
00132     UBool           operator==(const RegexPattern& that) const;
00133 
          // Inequality; defined inline as the negation of operator==.
00142     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);};
00143 
          // Assignment operator.
00149     RegexPattern  &operator =(const RegexPattern &source);
00150 
          // Returns a pointer to a copy of this pattern.
00158     virtual RegexPattern  *clone() const;
00159 
00160 
          // compile(): three static overloads that build a RegexPattern from a
          // pattern string.
          //   regex   the pattern source text.
          //   flags   option bits (overloads 2 and 3); stored in fFlags.
          //   pe      receives parse-error details (overloads 1 and 2).
          //   status  receives the error code.
00185     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00186         UParseError          &pe,
00187         UErrorCode           &status);
00188 
00213     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00214         uint32_t             flags,
00215         UParseError          &pe,
00216         UErrorCode           &status);
00217 
00218 
00241     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00242         uint32_t             flags,
00243         UErrorCode           &status);
00244 
00245 
          // Returns the flags associated with this pattern (see fFlags).
00251     virtual uint32_t flags() const;
00252 
          // Returns a RegexMatcher for this pattern applied to the given input.
00270     virtual RegexMatcher *matcher(const UnicodeString &input,
00271         UErrorCode          &status) const;
00272 
00273 private:
          // Overload taking a raw UChar pointer.  It is private, so code outside
          // this class and its friends cannot call it.
          // NOTE(review): presumably this exists so that passing a UChar* fails
          // to compile instead of silently converting to a temporary
          // UnicodeString -- confirm against the ICU API docs.
00285     RegexMatcher *matcher(const UChar *input,
00286         UErrorCode          &status) const;
00287 public:
00288 
00289 
          // Returns a RegexMatcher for this pattern; no input text is supplied.
00301     virtual RegexMatcher *matcher(UErrorCode  &status) const;
00302 
00303 
          // Static convenience function taking a pattern string and an input
          // string; returns a UBool match result.  pe and status report errors.
00318     static UBool U_EXPORT2 matches(const UnicodeString   &regex,
00319         const UnicodeString   &input,
00320         UParseError     &pe,
00321         UErrorCode      &status);
00322 
00323 
          // Returns the pattern string (by value).
00328     virtual UnicodeString pattern() const;
00329 
00330 
          // Splits input into the caller-supplied array dest, which has room for
          // destCapacity strings.
          // NOTE(review): the return value is presumably the number of fields
          // written -- confirm against the ICU API docs.
00356     virtual int32_t  split(const UnicodeString &input,
00357         UnicodeString    dest[],
00358         int32_t          destCapacity,
00359         UErrorCode       &status) const;
00360 
00361 
          // Class ID accessors: per-object (virtual) and per-class (static).
00367     virtual UClassID getDynamicClassID() const;
00368 
00374     static UClassID U_EXPORT2 getStaticClassID();
00375 
00376 private:
00377     //
00378     //  Implementation Data
00379     //
00380     UnicodeString   fPattern;      // The original pattern string.
00381     uint32_t        fFlags;        // The flags used when compiling the pattern.
00382                                    //
00383     UVector32       *fCompiledPat; // The compiled pattern p-code.
00384     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
00385                                    //   after un-escaping, for use during the match.
00386 
00387     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
00388     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
00389 
00390 
00391     UErrorCode      fDeferredStatus; // status if some prior error has left this
00392                                    //  RegexPattern in an unusable state.
00393 
00394     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
00395                                    //   >= this value.  For some patterns, this calculated
00396                                    //   value may be less than the true shortest
00397                                    //   possible match.
00398 
00399     int32_t         fFrameSize;    // Size of a state stack frame in the
00400                                    //   execution engine.
00401 
00402     int32_t         fDataSize;     // The size of the data needed by the pattern that
00403                                    //   does not go on the state stack, but has just
00404                                    //   a single copy per matcher.
00405 
00406     UVector32       *fGroupMap;    // Map from capture group number to position of
00407                                    //   the group's variables in the matcher stack frame.
00408 
00409     int32_t         fMaxCaptureDigits;
00410 
00411     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
00412                                    //   regex character classes, e.g. Word.
00413 
00414     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
00415                                    //  sets for predefined regex classes.
00416 
00417     int32_t         fStartType;    // Info on how a match must start.
00418     int32_t         fInitialStringIdx;     //
00419     int32_t         fInitialStringLen;
00420     UnicodeSet     *fInitialChars;
00421     UChar32         fInitialChar;
00422     Regex8BitSet   *fInitialChars8;
00423 
          // These classes access the private implementation data directly.
00424     friend class RegexCompile;
00425     friend class RegexMatcher;
00426     friend class RegexCImpl;
00427 
00428     //
00429     //  Implementation Methods
00430     //
00431     void        init();            // Common initialization, for use by constructors.
00432     void        zap();             // Common cleanup
          // Debug-only helpers, compiled in only when REGEX_DEBUG is defined.
00433 #ifdef REGEX_DEBUG
00434     void        dumpOp(int32_t index) const;
00435     friend     void U_EXPORT2 RegexPatternDump(const RegexPattern *);
00436 #endif
00437 
00438 };
00439 
00440 
00441 
// RegexMatcher applies a RegexPattern to an input string and records the
// state of the most recent match (see the private data members below).
// Copy construction, assignment and default construction are private, so
// instances cannot be copied or assigned.
//
// NOTE(review): the public matching API (matches / lookingAt / find / group /
// start / end / region / appendReplacement / ...) presumably follows the
// semantics of java.util.regex.Matcher, which a member comment below
// references; confirm details against the ICU API documentation.
00451 class U_I18N_API RegexMatcher: public UObject {
00452 public:
00453 
          // Constructs a matcher from a pattern string and flags; no input text
          // is supplied.  Errors are reported through status.
00468     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
00469 
          // Constructs a matcher from a pattern string, an input string and flags.
00491     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
00492         uint32_t flags, UErrorCode &status);
00493 
00494 private:
          // Overload taking a raw UChar pointer as input.  It is private, so code
          // outside this class and its friends cannot call it.
          // NOTE(review): presumably present so that passing a UChar* fails to
          // compile rather than converting to a temporary UnicodeString -- confirm.
00506     RegexMatcher(const UnicodeString &regexp, const UChar *input,
00507         uint32_t flags, UErrorCode &status);
00508 public:
00509 
00510 
          // Destructor (virtual).
00516     virtual ~RegexMatcher();
00517 
00518 
          // Match attempts.  Each returns a UBool result; the overloads taking an
          // index start from that position in the input.
00525     virtual UBool matches(UErrorCode &status);
00526 
00537     virtual UBool matches(int32_t startIndex, UErrorCode &status);
00538 
00539 
00540 
00541 
00555     virtual UBool lookingAt(UErrorCode &status);
00556 
00557 
00571     virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
00572 
          // Note: this overload of find() has no UErrorCode parameter.
00585     virtual UBool find();
00586 
00587 
00597     virtual UBool find(int32_t start, UErrorCode &status);
00598 
00599 
          // Accessors for the text of the match, or of one capture group
          // identified by groupNum.
00609     virtual UnicodeString group(UErrorCode &status) const;
00610 
00611 
00624     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00625 
00626 
          // Returns the capture group count.
00632     virtual int32_t groupCount() const;
00633 
00634 
          // Start and end positions of the match, or of a given capture group.
00642     virtual int32_t start(UErrorCode &status) const;
00643 
00644 
00658     virtual int32_t start(int32_t group, UErrorCode &status) const;
00659 
00660 
00670     virtual int32_t end(UErrorCode &status) const;
00671 
00672 
00686     virtual int32_t end(int32_t group, UErrorCode &status) const;
00687 
00688 
          // reset(): three public overloads -- no arguments, a position index,
          // or a new input string.  Each returns a reference to a RegexMatcher,
          // which permits call chaining.
00697     virtual RegexMatcher &reset();
00698 
00699 
00715     virtual RegexMatcher &reset(int32_t index, UErrorCode &status);
00716 
00717 
00731     virtual RegexMatcher &reset(const UnicodeString &input);
00732 
00733 private:
          // Private UChar* overload; not callable from outside this class and its
          // friends (same pattern as the private constructor above).
00745     RegexMatcher &reset(const UChar *input);
00746 public:
00747 
          // Returns a reference to the input string (see fInput).
00754     virtual const UnicodeString &input() const;
00755     
00756     
00757 
          // Region control: set the region [start, limit) and query its bounds
          // (see fRegionStart / fRegionLimit).
00776      virtual RegexMatcher &region(int32_t start, int32_t limit, UErrorCode &status);
00777 
00778 
00787      virtual int32_t regionStart() const;
00788 
00789 
00798       virtual int32_t regionEnd() const;
00799 
          // Query / set the transparent-bounds option (see fTransparentBounds).
00808       virtual UBool hasTransparentBounds() const;
00809 
00828       virtual RegexMatcher &useTransparentBounds(UBool b);
00829 
00830      
          // Query / set the anchoring-bounds option (see fAnchoringBounds).
00838       virtual UBool hasAnchoringBounds() const;
00839 
00852       virtual RegexMatcher &useAnchoringBounds(UBool b);
00853 
          // Status of the last match with respect to end of input
          // (see fHitEnd / fRequireEnd).
00866       virtual UBool hitEnd() const;
00867 
00877       virtual UBool requireEnd() const;
00878 
00879 
00880 
00881 
00882 
          // Returns a reference to the RegexPattern used by this matcher.
00888     virtual const RegexPattern &pattern() const;
00889 
00890 
          // Replacement operations; both return a new UnicodeString by value.
00907     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
00908 
00909 
00930     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
00931 
          // Incremental replacement: appendReplacement() and appendTail() write
          // into the caller's dest string (see fAppendPosition).
00959     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
00960         const UnicodeString &replacement, UErrorCode &status);
00961 
00962 
00973     virtual UnicodeString &appendTail(UnicodeString &dest);
00974 
00975 
00976 
          // Splits input into the caller-supplied array dest, which has room for
          // destCapacity strings.  Unlike RegexPattern::split(), this is non-const.
01001     virtual int32_t  split(const UnicodeString &input,
01002         UnicodeString    dest[],
01003         int32_t          destCapacity,
01004         UErrorCode       &status);
01005 
          // Time limit for the match engine (see fTimeLimit: arbitrary steps,
          // zero for unlimited).
01027     virtual void setTimeLimit(int32_t limit, UErrorCode &status);
01028 
01035     virtual int32_t getTimeLimit() const;
01036 
          // Backtrack stack size limit (see fStackLimit: bytes, zero for unlimited).
01058     virtual void setStackLimit(int32_t  limit, UErrorCode &status);
01059     
01067     virtual int32_t  getStackLimit() const;
01068 
01069 
          // Set / get the match progress callback function and its user context
          // pointer (see fCallbackFn / fCallbackContext).  The getter returns both
          // values through its reference parameters.
01083     virtual void setMatchCallback(URegexMatchCallback     *callback,
01084                                   const void              *context,
01085                                   UErrorCode              &status);
01086 
01087 
01088 
01099     virtual void getMatchCallback(URegexMatchCallback     *&callback,
01100                                   const void              *&context,
01101                                   UErrorCode              &status);
01102 
01103 
          // Enables or disables debug tracing of the match engine (see fTraceDebug).
01109     void setTrace(UBool state);
01110 
01111 
          // Class ID accessors: per-class (static) and per-object (virtual).
01117     static UClassID U_EXPORT2 getStaticClassID();
01118 
01124     virtual UClassID getDynamicClassID() const;
01125 
01126 private:
01127     // Constructors and other object boilerplate are private.
01128     // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
01129     RegexMatcher();                  // default constructor not implemented
01130     RegexMatcher(const RegexPattern *pat);
01131     RegexMatcher(const RegexMatcher &other);
01132     RegexMatcher &operator =(const RegexMatcher &rhs);
01133     void init(UErrorCode &status);                      // Common initialization
01134     void init2(const UnicodeString &s, UErrorCode &e);  // Common initialization, part 2.
01135 
          // Friends may use the private constructors and members above.
01136     friend class RegexPattern;
01137     friend class RegexCImpl;
01138 public:
01140     void resetPreserveRegion();  // Reset matcher state, but preserve any region.
01141 private:
01142 
01143     //
01144     //  MatchAt   This is the internal interface to the match engine itself.
01145     //            Match status comes back in matcher member variables.
01146     //
01147     void                 MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
01148     inline void          backTrack(int32_t &inputIdx, int32_t &patIdx);
01149     UBool                isWordBoundary(int32_t pos);         // perform Perl-like  \b test
01150     UBool                isUWordBoundary(int32_t pos);        // perform RBBI based \b test
01151     REStackFrame        *resetStack();
01152     inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status);
01153     void                 IncrementTime(UErrorCode &status);
01154 
01155 
01156     const RegexPattern  *fPattern;
01157     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
01158                                            //   should delete it when through.
01159 
01160     const UnicodeString *fInput;           // The text being matched. Is never NULL.
01161     int32_t              fFrameSize;       // The size of a frame in the backtrack stack.
01162     
01163     int32_t              fRegionStart;     // Start of the input region, default = 0.
01164     int32_t              fRegionLimit;     // End of input region, default to input.length.
01165     
01166     int32_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
01167     int32_t              fAnchorLimit;     //   See useAnchoringBounds
01168     
01169     int32_t              fLookStart;       // Region bounds for look-ahead/behind and
01170     int32_t              fLookLimit;       //   other boundary tests.  See
01171                                            //   useTransparentBounds
01172 
01173     int32_t              fActiveStart;     // Currently active bounds for matching.
01174     int32_t              fActiveLimit;     //   Usually is the same as region, but
01175                                            //   is changed to fLookStart/Limit when
01176                                            //   entering look around regions.
01177 
01178     UBool                fTransparentBounds;  // True if using transparent bounds.
01179     UBool                fAnchoringBounds; // True if using anchoring bounds.
01180 
01181     UBool                fMatch;           // True if the last attempted match was successful.
01182     int32_t              fMatchStart;      // Position of the start of the most recent match
01183     int32_t              fMatchEnd;        // First position after the end of the most recent match
01184                                            //   Zero if no previous match, even when a region
01185                                            //   is active.
01186     int32_t              fLastMatchEnd;    // First position after the end of the previous match,
01187                                            //   or -1 if there was no previous match.
01188     int32_t              fAppendPosition;  // First position after the end of the previous
01189                                            //   appendReplacement().  As described by the
01190                                            //   JavaDoc for Java Matcher, where it is called 
01191                                            //   "append position"
01192     UBool                fHitEnd;          // True if the last match touched the end of input.
01193     UBool                fRequireEnd;      // True if the last match required end-of-input
01194                                            //    (matched $ or Z)
01195 
01196     UVector32           *fStack;
01197     REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
01198                                            //   which will contain the capture group results.
01199                                            //   NOT valid while match engine is running.
01200 
01201     int32_t             *fData;            // Data area for use by the compiled pattern.
01202     int32_t             fSmallData[8];     //   Use this for data if it's enough.
01203 
01204     int32_t             fTimeLimit;        // Max time (in arbitrary steps) to let the
01205                                            //   match engine run.  Zero for unlimited.
01206     
01207     int32_t             fTime;             // Match time, accumulates while matching.
01208     int32_t             fTickCounter;      // Low bits counter for time.  Counts down StateSaves.
01209                                            //   Kept separately from fTime to keep as much
01210                                            //   code as possible out of the inline
01211                                            //   StateSave function.
01212 
01213     int32_t             fStackLimit;       // Maximum memory size to use for the backtrack
01214                                            //   stack, in bytes.  Zero for unlimited.
01215 
01216     URegexMatchCallback *fCallbackFn;       // Pointer to match progress callback funct.
01217                                            //   NULL if there is no callback.
01218     const void         *fCallbackContext;  // User Context ptr for callback function.
01219 
01220     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
01221 
01222     UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
01223                                            //   reported, or that permanently disables this matcher.
01224 
01225     RuleBasedBreakIterator  *fWordBreakItr;  // NOTE(review): presumably the break iterator
                                               //   behind isUWordBoundary() -- confirm.
01226 
01227 
01228 };
01229 
01230 U_NAMESPACE_END
01231 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
01232 #endif

Generated on Thu Jan 15 11:25:48 2009 for ICU 4.0.1 by  doxygen 1.4.7