Frames | No Frames |
1: /* gnu/regexp/REMatch.java 2: Copyright (C) 2006 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: 39: package gnu.java.util.regex; 40: import java.io.Serializable; 41: 42: /** 43: * An instance of this class represents a match 44: * completed by a gnu.regexp matching function. It can be used 45: * to obtain relevant information about the location of a match 46: * or submatch. 47: * 48: * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A> 49: */ 50: public final class REMatch implements Serializable, Cloneable { 51: private String matchedText; 52: private CharIndexed matchedCharIndexed; 53: 54: // These variables are package scope for fast access within the engine 55: int eflags; // execution flags this match was made using 56: 57: // Offset in source text where match was tried. This is zero-based; 58: // the actual position in the source text is given by (offset + anchor). 59: int offset; 60: 61: // Anchor position refers to the index into the source input 62: // at which the matching operation began. 63: // This is also useful for the ANCHORINDEX option. 64: int anchor; 65: 66: // Package scope; used by RE. 67: int index; // used while matching to mark current match position in input 68: // start1[i] is set when the i-th subexp starts. And start1[i] is copied 69: // to start[i] when the i-th subexp ends. So start[i] keeps the previously 70: // assigned value while the i-th subexp is being processed. This makes 71: // backreference to the i-th subexp within the i-th subexp possible. 72: int[] start; // start positions (relative to offset) for each (sub)exp. 73: int[] start1; // start positions (relative to offset) for each (sub)exp. 74: int[] end; // end positions for the same 75: // start[i] == -1 or end[i] == -1 means that the start/end position is void. 76: // start[i] == p or end[i] == p where p < 0 and p != -1 means that 77: // the actual start/end position is (p+1). Start/end positions may 78: // become negative when the subexpression is in a RETokenLookBehind. 79: boolean empty; // empty string matched. This flag is used only within 80: // RETokenRepeated. 81: 82: BacktrackStack backtrackStack; 83: 84: public Object clone() { 85: try { 86: REMatch copy = (REMatch) super.clone(); 87: 88: copy.start = (int[]) start.clone(); 89: copy.start1 = (int[]) start1.clone(); 90: copy.end = (int[]) end.clone(); 91: 92: return copy; 93: } catch (CloneNotSupportedException e) { 94: throw new Error(); // doesn't happen 95: } 96: } 97: 98: void assignFrom(REMatch other) { 99: start = other.start; 100: start1 = other.start1; 101: end = other.end; 102: index = other.index; 103: backtrackStack = other.backtrackStack; 104: } 105: 106: REMatch(int subs, int anchor, int eflags) { 107: start = new int[subs+1]; 108: start1 = new int[subs+1]; 109: end = new int[subs+1]; 110: this.anchor = anchor; 111: this.eflags = eflags; 112: clear(anchor); 113: } 114: 115: void finish(CharIndexed text) { 116: start[0] = 0; 117: StringBuffer sb = new StringBuffer(); 118: int i; 119: for (i = 0; i < end[0]; i++) 120: sb.append(text.charAt(i)); 121: matchedText = sb.toString(); 122: matchedCharIndexed = text; 123: for (i = 0; i < start.length; i++) { 124: // If any subexpressions didn't terminate, they don't count 125: // TODO check if this code ever gets hit 126: if ((start[i] == -1) ^ (end[i] == -1)) { 127: start[i] = -1; 128: end[i] = -1; 129: } 130: } 131: backtrackStack = null; 132: } 133: 134: /** Clears the current match and moves the offset to the new index. */ 135: void clear(int index) { 136: offset = index; 137: this.index = 0; 138: for (int i = 0; i < start.length; i++) { 139: start[i] = start1[i] = end[i] = -1; 140: } 141: backtrackStack = null; 142: } 143: 144: /** 145: * Returns the string matching the pattern. This makes it convenient 146: * to write code like the following: 147: * <P> 148: * <code> 149: * REMatch myMatch = myExpression.getMatch(myString);<br> 150: * if (myMatch != null) System.out.println("Regexp found: "+myMatch); 151: * </code> 152: */ 153: public String toString() { 154: return matchedText; 155: } 156: 157: /** 158: * Returns the index within the input text where the match in its entirety 159: * began. 160: */ 161: public int getStartIndex() { 162: return offset + start[0]; 163: } 164: 165: /** 166: * Returns the index within the input string where the match in 167: * its entirety ends. The return value is the next position after 168: * the end of the string; therefore, a match created by the 169: * following call: 170: * 171: * <P> 172: * <code>REMatch myMatch = myExpression.getMatch(myString);</code> 173: * <P> 174: * can be viewed (given that myMatch is not null) by creating 175: * <P> 176: * <code>String theMatch = myString.substring(myMatch.getStartIndex(), 177: * myMatch.getEndIndex());</code> 178: * <P> 179: * But you can save yourself that work, since the <code>toString()</code> 180: * method (above) does exactly that for you. 181: */ 182: public int getEndIndex() { 183: return offset + end[0]; 184: } 185: 186: /** 187: * Returns the string matching the given subexpression. The subexpressions 188: * are indexed starting with one, not zero. That is, the subexpression 189: * identified by the first set of parentheses in a regular expression 190: * could be retrieved from an REMatch by calling match.toString(1). 191: * 192: * @param sub Index of the subexpression. 193: */ 194: public String toString(int sub) { 195: if ((sub >= start.length) || sub < 0) 196: throw new IndexOutOfBoundsException("No group " + sub); 197: if (start[sub] == -1) return null; 198: if (start[sub] >= 0 && end[sub] <= matchedText.length()) 199: return (matchedText.substring(start[sub],end[sub])); 200: else { 201: // This case occurs with RETokenLookAhead or RETokenLookBehind. 202: StringBuffer sb = new StringBuffer(); 203: int s = start[sub]; 204: int e = end[sub]; 205: if (s < 0) s += 1; 206: if (e < 0) e += 1; 207: for (int i = start[0] + s; i < start[0] + e; i++) 208: sb.append(matchedCharIndexed.charAt(i)); 209: return sb.toString(); 210: } 211: } 212: 213: /** 214: * Returns the index within the input string used to generate this match 215: * where subexpression number <i>sub</i> begins, or <code>-1</code> if 216: * the subexpression does not exist. The initial position is zero. 217: * 218: * @param sub Subexpression index 219: * @deprecated Use getStartIndex(int) instead. 220: */ 221: public int getSubStartIndex(int sub) { 222: if (sub >= start.length) return -1; 223: int x = start[sub]; 224: return (x == -1) ? x : 225: (x >= 0) ? offset + x : offset + x + 1; 226: } 227: 228: /** 229: * Returns the index within the input string used to generate this match 230: * where subexpression number <i>sub</i> begins, or <code>-1</code> if 231: * the subexpression does not exist. The initial position is zero. 232: * 233: * @param sub Subexpression index 234: * @since gnu.regexp 1.1.0 235: */ 236: public int getStartIndex(int sub) { 237: if (sub >= start.length) return -1; 238: int x = start[sub]; 239: return (x == -1) ? x : 240: (x >= 0) ? offset + x : offset + x + 1; 241: } 242: 243: /** 244: * Returns the index within the input string used to generate this match 245: * where subexpression number <i>sub</i> ends, or <code>-1</code> if 246: * the subexpression does not exist. The initial position is zero. 247: * 248: * @param sub Subexpression index 249: * @deprecated Use getEndIndex(int) instead 250: */ 251: public int getSubEndIndex(int sub) { 252: if (sub >= start.length) return -1; 253: int x = end[sub]; 254: return (x == -1) ? x : 255: (x >= 0) ? offset + x : offset + x + 1; 256: } 257: 258: /** 259: * Returns the index within the input string used to generate this match 260: * where subexpression number <i>sub</i> ends, or <code>-1</code> if 261: * the subexpression does not exist. The initial position is zero. 262: * 263: * @param sub Subexpression index 264: */ 265: public int getEndIndex(int sub) { 266: if (sub >= start.length) return -1; 267: int x = end[sub]; 268: return (x == -1) ? x : 269: (x >= 0) ? offset + x : offset + x + 1; 270: } 271: 272: /** 273: * Substitute the results of this match to create a new string. 274: * This is patterned after PERL, so the tokens to watch out for are 275: * <code>$0</code> through <code>$9</code>. <code>$0</code> matches 276: * the full substring matched; <code>$<i>n</i></code> matches 277: * subexpression number <i>n</i>. 278: * <code>$10, $11, ...</code> may match the 10th, 11th, ... subexpressions 279: * if such subexpressions exist. 280: * 281: * @param input A string consisting of literals and <code>$<i>n</i></code> tokens. 282: */ 283: public String substituteInto(String input) { 284: // a la Perl, $0 is whole thing, $1 - $9 are subexpressions 285: StringBuffer output = new StringBuffer(); 286: int pos; 287: for (pos = 0; pos < input.length()-1; pos++) { 288: if ((input.charAt(pos) == '$') && (Character.isDigit(input.charAt(pos+1)))) { 289: int val = Character.digit(input.charAt(++pos),10); 290: int pos1 = pos + 1; 291: while (pos1 < input.length() && 292: Character.isDigit(input.charAt(pos1))) { 293: int val1 = val*10 + Character.digit(input.charAt(pos1),10); 294: if (val1 >= start.length) break; 295: pos1++; 296: val = val1; 297: } 298: pos = pos1 - 1; 299: 300: if (val < start.length) { 301: output.append(toString(val)); 302: } 303: } else output.append(input.charAt(pos)); 304: } 305: if (pos < input.length()) output.append(input.charAt(pos)); 306: return output.toString(); 307: } 308: 309: /* The following are used for debugging purpose 310: static String d(REMatch m) { 311: if (m == null) return "null"; 312: else return "[" + m.index + "]"; 313: } 314: 315: String substringUptoIndex(CharIndexed input) { 316: StringBuffer sb = new StringBuffer(); 317: for (int i = 0; i < index; i++) { 318: sb.append(input.charAt(i)); 319: } 320: return sb.toString(); 321: } 322: */ 323: 324: }