Source for gnu.javax.swing.text.html.parser.support.low.Constants

   1: /* Constants.java --
   2:    Copyright (C) 2005 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.javax.swing.text.html.parser.support.low;
  40: 
  41: import java.util.BitSet;
  42: 
  43: /**
  44:  * The parser constants and operations, directly related to the parser
  45:  * constants.
  46:  * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
  47:  */
  48: public class Constants
  49: {
  50:   /* Single character tokens are reflected into they ASCII codes. */
  51: 
  52:   /**
  53:    * Start of HTML token.
  54:    */
  55:   public static final int BEGIN = '<';
  56: 
  57:   /**
  58:    * End of HTML token.
  59:    */
  60:   public static final int END = '>';
  61: 
  62:   /**
  63:    * Exclamation (indicates SGML or comment).
  64:    */
  65:   public static final int EXCLAMATION = '!';
  66: 
  67:   /**
  68:    * Slash (indicates closing tag).
  69:    */
  70:   public static final int SLASH = '/';
  71: 
  72:   /**
  73:    * Equals sign.
  74:    */
  75:   public static final int EQ = '=';
  76: 
  77:   /**
  78:    * Quoting sign.
  79:    */
  80:   public static final int AP = '\'';
  81: 
  82:   /**
  83:    * Quoting sign.
  84:    */
  85:   public static final int QUOT = '"';
  86: 
  87:   /* The numbers of other tokens start outside the ascii space. */
  88:   /* String tokens */
  89: 
  90:   /**
  91:    * Double dash (--)
  92:    */
  93:   public static final int DOUBLE_DASH = 1000;
  94: 
  95:   /**
  96:    * The STYLE tag (needs special handling).
  97:    */
  98:   public static final int STYLE = 1001;
  99: 
 100:   /**
 101:    * The SCRIPT tag (needs special handling).
 102:    */
 103:   public static final int SCRIPT = 1002;
 104: 
 105:   /* Pattern tokens */
 106: 
 107:   /**
 108:    * HTML whitespace.
 109:    */
 110:   public static final int WS = 1003;
 111: 
 112:   /**
 113:    * Named or numeric entity,
 114:    */
 115:   public static final int ENTITY = 1004;
 116: 
 117:   /**
 118:    * Sequence of valid name characters (can start from digit).
 119:    */
 120:   public static final int NUMTOKEN = 1005;
 121: 
 122:   /* Complex tokens */
 123: 
 124:   /**
 125:    * Comment opening sequence.
 126:    */
 127:   public static final pattern COMMENT_OPEN =
 128:     new pattern(new node[]
 129:                 {
 130:                   new node(BEGIN), new node(WS, true), new node(EXCLAMATION),
 131:                   new node(WS, true), new node(DOUBLE_DASH),
 132:                 }
 133:                );
 134: 
 135:   /**
 136:    * Comment closing sequence
 137:    */
 138:   public static final pattern COMMENT_END =
 139:     new pattern(new node[]
 140:                 {
 141:                   new node(DOUBLE_DASH), new node(WS, true), new node(END)
 142:                 }
 143:                );
 144: 
 145:   /**
 146:    * Special case ---> (also is treated as end of comment).
 147:    */
 148:   public static final pattern COMMENT_TRIPLEDASH_END =
 149:     new pattern(new node[]
 150:                 {
 151:                   new node(DOUBLE_DASH), new node(NUMTOKEN), new node(END)
 152:                 }
 153:                );
 154: 
 155:   /**
 156:    * STYLE element heading pattern.
 157:    */
 158:   public static final pattern STYLE_OPEN =
 159:     new pattern(new node[] { new node(BEGIN), new node(WS, true), new node(STYLE) });
 160: 
 161:   /**
 162:    * SCRIPT element heading pattern.
 163:    */
 164:   public static final pattern SCRIPT_OPEN =
 165:     new pattern(new node[] { new node(BEGIN), new node(WS, true), new node(SCRIPT) });
 166: 
 167:   /**
 168:    * SGML element heading pattern.
 169:    */
 170:   public static final pattern SGML =
 171:     new pattern(new node[]
 172:                 {
 173:                   new node(BEGIN), new node(WS, true), new node(EXCLAMATION)
 174:                 }
 175:                );
 176: 
 177:   /**
 178:    * SCRIPT element closing pattern.
 179:    */
 180:   public static final pattern SCRIPT_CLOSE =
 181:     new pattern(new node[]
 182:                 {
 183:                   new node(BEGIN), new node(WS, true), new node(SLASH),
 184:                   new node(WS, true), new node(SCRIPT), new node(WS, true),
 185:                   new node(END)
 186:                 }
 187:                );
 188: 
 189:   /**
 190:    * STYLE element closing pattern.
 191:    */
 192:   public static final pattern STYLE_CLOSE =
 193:     new pattern(new node[]
 194:                 {
 195:                   new node(BEGIN), new node(WS, true), new node(SLASH),
 196:                   new node(WS, true), new node(STYLE), new node(WS, true),
 197:                   new node(END)
 198:                 }
 199:                );
 200: 
 201:   /**
 202:    * Ordinary HTML tag heading pattern.
 203:    */
 204:   public static final pattern TAG =
 205:     new pattern(new node[]
 206:                 {
 207:                   new node(BEGIN), new node(WS, true), new node(SLASH, true),
 208:                   new node(WS, true), new node(NUMTOKEN)
 209:                 }
 210:                );
 211: 
 212:   /* Special tokens */
 213: 
 214:   /**
 215:    * All other tokens.
 216:    */
 217:   public static final int OTHER = 1999;
 218: 
 219:   /**
 220:    * The UNICODE "end of text" control code
 221:    */
 222:   static final char ETX = 3;
 223: 
 224:   /**
 225:    * End of file.
 226:    */
 227:   public static final int EOF = ETX;
 228: 
 229:   /* Character categories */
 230: 
 231:   /**
 232:    * All single char tokens.
 233:    */
 234:   public static final BitSet bSINGLE_CHAR_TOKEN = new BitSet();
 235: 
 236:   /**
 237:    * Non letters and non numbers, allowed in HTML names.
 238:    */
 239:   public static final BitSet bSPECIAL = new BitSet();
 240: 
 241:   /**
 242:    * All letters, used in HTML names.
 243:    */
 244:   public static final BitSet bLETTER = new BitSet();
 245: 
 246:   /**
 247:    * Digits.
 248:    */
 249:   public static final BitSet bDIGIT = new BitSet();
 250: 
 251:   /**
 252:    * Both line breaks.
 253:    */
 254:   public static final BitSet bLINEBREAK = new BitSet();
 255: 
 256:   /**
 257:    * All whitespace.
 258:    */
 259:   public static final BitSet bWHITESPACE = new BitSet();
 260: 
 261:   /**
 262:    * Both quoting characters.
 263:    */
 264:   public static final BitSet bQUOTING = new BitSet();
 265: 
 266:   /**
 267:    * Valid name characters.
 268:    */
 269:   public static final BitSet bNAME = new BitSet();
 270: 
 271:   /* Entity subcategories */
 272: 
 273:   /**
 274:    * Named entity.
 275:    */
 276:   public static final int ENTITY_NAMED = 1;
 277: 
 278:   /**
 279:    * Numeric entity.
 280:    */
 281:   public static final int ENTITY_NUMERIC = 2;
 282: 
 283:   static
 284:   {
 285:     bQUOTING.set(AP);
 286:     bQUOTING.set(QUOT);
 287: 
 288:     bSINGLE_CHAR_TOKEN.set(BEGIN);
 289:     bSINGLE_CHAR_TOKEN.set(END);
 290:     bSINGLE_CHAR_TOKEN.set(EXCLAMATION);
 291:     bSINGLE_CHAR_TOKEN.set(SLASH);
 292:     bSINGLE_CHAR_TOKEN.set(EQ);
 293:     bSINGLE_CHAR_TOKEN.set(EOF);
 294: 
 295:     bSINGLE_CHAR_TOKEN.or(bQUOTING);
 296: 
 297:     bLINEBREAK.set('\r');
 298:     bLINEBREAK.set('\n');
 299: 
 300:     bWHITESPACE.set(' ');
 301:     bWHITESPACE.set('\t');
 302:     bWHITESPACE.set(0xC);
 303:     bWHITESPACE.or(bLINEBREAK);
 304: 
 305:     for (char i = '0'; i <= '9'; i++)
 306:       {
 307:         bDIGIT.set(i);
 308:       }
 309: 
 310:     for (char i = 'a'; i <= 'z'; i++)
 311:       {
 312:         bLETTER.set(i);
 313:       }
 314: 
 315:     for (char i = 'A'; i <= 'Z'; i++)
 316:       {
 317:         bLETTER.set(i);
 318:       }
 319: 
 320:     bSPECIAL.set('-');
 321:     bSPECIAL.set('_');
 322:     bSPECIAL.set(':');
 323:     bSPECIAL.set('.');
 324: 
 325:     bNAME.or(bLETTER);
 326:     bNAME.or(bDIGIT);
 327:     bNAME.or(bSPECIAL);
 328:   }
 329: 
 330:   /**
 331:    * Verifies if one of the tokens matches the end of string
 332:    * buffer. The last character in the string buffer is the
 333:    * "future character", some tokens needs to verify it the
 334:    * token does not continue "towards the future". If the token
 335:    * matches, it matches till "pre-last" character in the buffer.
 336:    * @param b
 337:    * @return
 338:    */
 339:   public Token endMatches(Buffer b)
 340:   {
 341:     if (b.length() < 2)
 342:       return null;
 343: 
 344:     int p = b.length() - 2;
 345: 
 346:     if (b.length() > 2 && b.charAt(p) == '-' && b.charAt(p - 1) == '-')
 347:       return new Token(DOUBLE_DASH, "--", b.getLocation(p - 1, p + 1));
 348: 
 349:     char last = b.charAt(p);
 350: 
 351:     if (bSINGLE_CHAR_TOKEN.get(last))
 352:       return new Token(last, last, b.getLocation(p, p + 1));
 353: 
 354:     char future = b.charAt(p + 1);
 355: 
 356:     // Check for numtokens, script and style:
 357:     if (bNAME.get(last) && !bNAME.get(future))
 358:       {
 359:         // Scan the history up:
 360:         int u = p - 1;
 361:         while (u >= 0 && bNAME.get(b.charAt(u)))
 362:           u--;
 363:         u++;
 364: 
 365:         char[] token = new char[ p - u + 1 ];
 366: 
 367:         // Found a numtoken
 368:         b.getChars(u, p + 1, token, 0);
 369: 
 370:         // Verify for the built-in tokens:
 371:         String e = new String(token);
 372: 
 373:         // found the entity reference
 374:         if (u > 0 && b.charAt(u - 1) == '&')
 375:           {
 376:             // The subsequent semicolon may be the part of the token
 377:             // as well. The semicolon must be ignored. This must be
 378:             // handled elsewhere.
 379:             return new Token(ENTITY, ENTITY_NAMED, "&" + e,
 380:                              b.getLocation(u - 1, p + 1)
 381:                             );
 382:           }
 383: 
 384:         // found the numeric entity reference
 385:         if (u > 1 && b.charAt(u - 1) == '#' && b.charAt(u - 2) == '&')
 386:           {
 387:             // The subsequent semicolon may be the part of the token
 388:             // as well. The semicolon must be ignored. This must be
 389:             // handled elsewhere.
 390:             return new Token(ENTITY, ENTITY_NUMERIC, "&#" + e,
 391:                              b.getLocation(u - 2, p + 2)
 392:                             );
 393:           }
 394: 
 395:         Location le = b.getLocation(u, p + 1);
 396: 
 397:         if (e.equalsIgnoreCase("SCRIPT"))
 398:           return new Token(SCRIPT, e, le);
 399:         else if (e.equalsIgnoreCase("STYLE"))
 400:           return new Token(STYLE, e, le);
 401:         else
 402:           return new Token(NUMTOKEN, e, le);
 403:       }
 404: 
 405:     // Check for whitespace
 406:     if (bWHITESPACE.get(last) && !bWHITESPACE.get(future))
 407:       {
 408:         // Scan the history up:
 409:         int u = p - 1;
 410:         while (u >= 0 && bWHITESPACE.get(b.charAt(u)))
 411:           u--;
 412:         u++;
 413: 
 414:         char[] token = new char[ p - u + 1 ];
 415:         b.getChars(u, p + 1, token, 0);
 416: 
 417:         return new Token(WS, new String(token), b.getLocation(u, p + 1));
 418:       }
 419: 
 420:     return null;
 421:   }
 422: }