Class | HTML5::HTMLTokenizer |
In: |
lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb
|
Parent: | Object |
This class takes care of tokenizing HTML.
content_model_flag | [RW] | |
current_token | [RW] | |
stream | [R] |
TODO: documentation for the content_model_flag, current_token, and stream accessors still needs to be written.
# Builds a tokenizer over +stream+ (anything HTMLInputStream accepts).
#
# Options:
#   :lowercase_element_name -- downcase tag names when emitting (default true)
#   :lowercase_attr_name    -- downcase attribute names (default true)
def initialize(stream, options = {})
  @stream = HTMLInputStream.new(stream, options)

  # Initial state of the tokenizer state machine.
  @content_model_flag = :PCDATA
  @state = :data_state
  @escapeFlag = false
  @lastFourChars = []

  # The token currently being built.
  @current_token = nil

  # Completed tokens waiting to be yielded by #each.
  @token_queue = []

  @lowercase_element_name = options[:lowercase_element_name] != false
  @lowercase_attr_name = options[:lowercase_attr_name] != false
end
# Handles the character that follows a completed attribute name.
def after_attribute_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Collapse the whole run of whitespace in one go.
    @stream.chars_until(SPACE_CHARACTERS, true)
  elsif data == "="
    @state = :before_attribute_value_state
  elsif data == ">"
    emit_current_token
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
    emit_current_token
  elsif ASCII_LETTERS.include? data
    # A new attribute starts here.
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  elsif data == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  end
  true
end
# After the DOCTYPE name: either the doctype ends, or the next six
# characters spell PUBLIC/SYSTEM (case-insensitively), or the doctype is
# bogus.
def after_doctype_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Skip whitespace; stay in this state.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @current_token[:correct] = false
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @token_queue << @current_token
    @state = :data_state
  else
    # Look ahead six characters total for the PUBLIC/SYSTEM keyword.
    # (Fixed: use @stream directly rather than the reader method, matching
    # every other state in this tokenizer.)
    char_stack = [data]
    5.times { char_stack << @stream.char }
    token = char_stack.join('').tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
    if token == "public" and !char_stack.include?(:EOF)
      @state = :before_doctype_public_identifier_state
    elsif token == "system" and !char_stack.include?(:EOF)
      @state = :before_doctype_system_identifier_state
    else
      @stream.unget(char_stack)
      # Fixed: the extra data used the string key "datavars" here while
      # every other ParseError token in the tokenizer uses the :datavars
      # symbol, making this one invisible to consumers.
      @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", :datavars => {"data" => data}}
      @state = :bogus_doctype_state
    end
  end
  return true
end
# After the public identifier: expect whitespace, a system identifier,
# or the end of the doctype.
def after_doctype_public_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Skip whitespace; stay in this state.
  elsif data == "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  elsif data == "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # Fixed: this branch handles a stray character, not EOF, so it must
    # report "unexpected-char-in-doctype" (it previously copied the
    # "eof-in-doctype" message from the branch above).
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  return true
end
# After the system identifier: only whitespace or ">" may follow.
def after_doctype_system_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Skip whitespace; stay in this state.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # Fixed: this branch handles a stray character, not EOF, so it must
    # report "unexpected-char-in-doctype" (it previously copied the
    # "eof-in-doctype" message from the branch above).
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  return true
end
# Accumulates an attribute name; on leaving, lowercases it (if configured)
# and reports duplicate attribute names.
def attribute_name_state
  data = @stream.char
  leaving = true
  emit_token = false
  if data == "="
    @state = :before_attribute_value_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
    @state = :data_state
    emit_token = true
  elsif ASCII_LETTERS.include? data
    @current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
    leaving = false
  elsif data == ">"
    # XXX If we emit here the attributes are converted to a dict without
    # being checked; defer the emit until after the checks below run.
    emit_token = true
  elsif SPACE_CHARACTERS.include? data
    @state = :after_attribute_name_state
  elsif data == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    @current_token[:data][-1][0] += data
    leaving = false
  end

  if leaving
    # Attributes are not dropped at this stage (that happens when the start
    # tag token is emitted), but duplicate names are reported now.
    if @lowercase_attr_name
      @current_token[:data][-1][0] = @current_token[:data].last.first.downcase
    end
    @current_token[:data][0...-1].each do |name, value|
      if @current_token[:data].last.first == name
        @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
        break # report the error at most once
      end
    end
    emit_current_token if emit_token
  end
  true
end
# Accumulates a double-quoted attribute value, resolving entities as it goes.
def attribute_value_double_quoted_state
  data = @stream.char
  if data == "\""
    @state = :before_attribute_name_state
  elsif data == "&"
    process_entity_in_attribute
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
    emit_current_token
  else
    # Append everything up to the next delimiter in one read.
    @current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
  end
  true
end
# Accumulates a single-quoted attribute value, resolving entities as it goes.
def attribute_value_single_quoted_state
  data = @stream.char
  if data == "'"
    @state = :before_attribute_name_state
  elsif data == "&"
    process_entity_in_attribute
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
    emit_current_token
  else
    # Append everything up to the next delimiter in one read.
    @current_token[:data][-1][1] += data + @stream.chars_until(["'", "&"])
  end
  true
end
# Accumulates an unquoted attribute value; whitespace or ">" terminates it.
def attribute_value_unquoted_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    @state = :before_attribute_name_state
  elsif data == "&"
    process_entity_in_attribute
  elsif data == ">"
    emit_current_token
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
    emit_current_token
  else
    # Append everything up to the next delimiter in one read.
    @current_token[:data][-1][1] += data + @stream.chars_until(["&", ">", "<"] + SPACE_CHARACTERS)
  end
  true
end
# Skips whitespace inside a tag and starts a new attribute when one appears.
def before_attribute_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    @stream.chars_until(SPACE_CHARACTERS, true)
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
    emit_current_token
  elsif ASCII_LETTERS.include? data
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  elsif data == ">"
    emit_current_token
  elsif data == "/"
    process_solidus_in_tag
  else
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  end
  true
end
# Decides how the attribute value is delimited (quoted, unquoted, empty).
def before_attribute_value_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    @stream.chars_until(SPACE_CHARACTERS, true)
  elsif data == "\""
    @state = :attribute_value_double_quoted_state
  elsif data == "&"
    # The "&" belongs to the (unquoted) value; push it back.
    @state = :attribute_value_unquoted_state
    @stream.unget(data)
  elsif data == "'"
    @state = :attribute_value_single_quoted_state
  elsif data == ">"
    emit_current_token
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
    emit_current_token
  else
    @current_token[:data][-1][1] += data
    @state = :attribute_value_unquoted_state
  end
  true
end
# Skips whitespace before the DOCTYPE name and starts collecting it.
def before_doctype_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Skip whitespace; stay in this state.
  elsif data == ">"
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] = data
    @state = :doctype_name_state
  end
  true
end
# Expects the opening quote of the DOCTYPE public identifier.
def before_doctype_public_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Skip whitespace; stay in this state.
  elsif data == "\""
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_double_quoted_state
  elsif data == "'"
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_single_quoted_state
  elsif data == ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  true
end
# Expects the opening quote of the DOCTYPE system identifier.
def before_doctype_system_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Skip whitespace; stay in this state.
  elsif data == "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  elsif data == "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  elsif data == ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  true
end
# Emits everything up to the next ">" (or EOF) as a Comment token.
def bogus_comment_state
  # chars_until stops at ">" or :EOF on its own.
  @token_queue << {:type => :Comment, :data => @stream.chars_until(">")}

  # Swallow the terminating ">" (or the :EOF) itself.
  @stream.char
  @state = :data_state
  true
end
# Skips everything up to ">" after a malformed DOCTYPE; the token is always
# flagged as incorrect.
def bogus_doctype_state
  data = @stream.char
  @current_token[:correct] = false
  if data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    # XXX EMIT
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    # (Removed a redundant second `@current_token[:correct] = false` here;
    # the flag is already cleared unconditionally above.)
    @token_queue << @current_token
    @state = :data_state
  end
  return true
end
# Handles "</". In RCDATA/CDATA the next characters must match the name of
# the last emitted start tag, otherwise "</" is just character data.
def close_tag_open_state
  if @content_model_flag == :RCDATA or @content_model_flag == :CDATA
    if @current_token
      char_stack = []

      # "</" has been consumed. Peek far enough ahead to compare against the
      # last start tag's name plus the one character that follows it.
      (@current_token[:name].length + 1).times do
        char_stack.push(@stream.char)
        # Stop early rather than reading past :EOF.
        break if char_stack[-1] == :EOF
      end

      # This was only a lookahead; put everything back.
      @stream.unget(char_stack)
    end

    if @current_token and
       @current_token[:name].downcase == char_stack[0...-1].join('').downcase and
       (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
      # The characters match, so it is safe to switch back to PCDATA now
      # instead of waiting for the end tag token to be emitted.
      @content_model_flag = :PCDATA
    else
      @token_queue << {:type => :Characters, :data => "</"}
      @state = :data_state
      # Nothing below applies in this case.
      return true
    end
  end

  data = @stream.char
  if data == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
    @token_queue << {:type => :Characters, :data => "</"}
    @state = :data_state
  elsif ASCII_LETTERS.include? data
    @current_token = {:type => :EndTag, :name => data, :data => []}
    @state = :tag_name_state
  elsif data == ">"
    @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
    @state = :data_state
  else
    # XXX data can be _'_...
    @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
    @stream.unget(data)
    @state = :bogus_comment_state
  end
  true
end
# Saw one "-" inside a comment; a second one moves towards "-->".
def comment_end_dash_state
  data = @stream.char
  if data == "-"
    @state = :comment_end_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += "-" + data + @stream.chars_until("-")
    # Consume the character after the run (a "-" or :EOF) as well, so a
    # "--" sequence reaches comment_end_state without a spurious ParseError.
    @stream.char
  end
  true
end
# Saw "--" inside a comment; ">" finishes it, anything else is an error.
def comment_end_state
  data = @stream.char
  if data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == "-"
    @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
    @current_token[:data] += data
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
    @token_queue << @current_token
    @state = :data_state
  else
    # The "--" turned out not to end the comment; keep it as data.
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
    @current_token[:data] += "--" + data
    @state = :comment_state
  end
  true
end
# Saw "<!--" followed by one "-": another "-" could already end the comment.
def comment_start_dash_state
  data = @stream.char
  if data == "-"
    @state = :comment_end_state
  elsif data == ">"
    @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += '-' + data + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
# Entered right after "<!--"; an immediate ">" means an empty (incorrect)
# comment.
def comment_start_state
  data = @stream.char
  if data == "-"
    @state = :comment_start_dash_state
  elsif data == ">"
    @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += data + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
# Accumulates comment text until a "-" (possible end) or EOF.
def comment_state
  data = @stream.char
  if data == "-"
    @state = :comment_end_dash_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += data + @stream.chars_until("-")
  end
  true
end
# Consumes a character reference ("&...") and returns the replacement
# character, or nil when no entity was recognised (in which case everything
# consumed has been pushed back onto the stream).
def consume_entity(from_attribute = false)
  char = nil
  char_stack = [@stream.char]
  if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0])
    # A bare "&": not an entity at all.
    @stream.unget(char_stack)
  elsif char_stack[0] == '#'
    # Possibly a numeric entity.
    char_stack += [@stream.char, @stream.char]
    if char_stack[0..1].include? :EOF
      # Hit the end of the file: push back everything up to the :EOF.
      char_stack = char_stack[0...char_stack.index(:EOF)]
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
    else
      if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
        # Hexadecimal entity.
        @stream.unget(char_stack[2])
        char = consume_number_entity(true)
      elsif DIGITS.include? char_stack[1]
        # Decimal entity.
        @stream.unget(char_stack[1..-1])
        char = consume_number_entity(false)
      else
        # Not numeric after all.
        @stream.unget(char_stack)
        @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
      end
    end
  else
    # Possibly a named entity. Narrow the candidate list from ENTITIES while
    # consuming characters, keeping the longest match (so &notin; beats &not;).
    filteredEntityList = ENTITIES.keys
    filteredEntityList.reject! { |e| e[0].chr != char_stack[0] }
    entityName = nil

    while char_stack.last != :EOF
      name = char_stack.join('')
      if filteredEntityList.any? { |e| e[0...name.length] == name }
        filteredEntityList.reject! { |e| e[0...name.length] != name }
        char_stack.push(@stream.char)
      else
        break
      end

      if ENTITIES.include? name
        entityName = name
        # NOTE(review): this compares against the string ';' while the checks
        # below use the ?; character literal — equivalent on Ruby 1.9+, but
        # not on 1.8 (String vs Fixnum). Left as-is; verify target Ruby.
        break if entityName[-1] == ';'
      end
    end

    if entityName != nil
      char = ENTITIES[entityName]

      # Entities without a trailing ";" are legal but flagged.
      if entityName[-1] != ?;
        @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
      end

      if entityName[-1] != ";" and from_attribute and
         (ASCII_LETTERS.include?(char_stack[entityName.length]) or
          DIGITS.include?(char_stack[entityName.length]))
        # In attributes, an alphanumeric right after a semicolonless entity
        # means it was not an entity after all.
        @stream.unget(char_stack)
        char = '&'
      else
        # Push back whatever we consumed beyond the entity name.
        @stream.unget(char_stack[entityName.length..-1])
      end
    else
      @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
      @stream.unget(char_stack)
    end
  end
  return char
end
This function returns either U+FFFD or the character based on the decimal or hexadecimal representation. It also discards ";" if present. If it is not present, a `{:type => :ParseError}` token is pushed onto `@token_queue`.
# Consumes a decimal (or, when +isHex+ is true, hexadecimal) character
# reference and returns the corresponding character, or U+FFFD when the
# code point cannot be represented. A missing trailing ";" is reported as
# a parse error and the offending character is pushed back.
def consume_number_entity(isHex)
  allowed, radix = isHex ? [HEX_DIGITS, 16] : [DIGITS, 10]

  # Collect digit characters, stopping at the first non-digit or :EOF.
  digits = []
  c = @stream.char
  while allowed.include?(c) and c != :EOF
    digits.push(c)
    c = @stream.char
  end

  char_code = digits.join('').to_i(radix)

  if char_code == 13
    # A CR entity is normalised to LF so "\r" never reaches the DOM.
    @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
    char_code = 10
  elsif (128..159).include? char_code
    # C1 control range (128..159): apply the windows-1252 remapping.
    @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
    char_code = ENTITIES_WINDOWS1252[char_code - 128]
  end

  # Valid range is 1..1114111 (U+10FFFF) excluding the surrogate block
  # 55296..57343 (U+D800..U+DFFF).
  if 0 < char_code and char_code <= 1114111 and not (55296 <= char_code and char_code <= 57343)
    char = [char_code].pack('U')
  else
    char = [0xFFFD].pack('U')
    @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => char_code}}
  end

  # Discard the ";" if present; otherwise report it and push the
  # character back.
  if c != ";"
    @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
    @stream.unget(c)
  end

  char
end
XXX AT Perhaps we should have Hixie run some evaluation on billions of documents to figure out what the order of the various if and elsif statements should be.
# The default state: emits character data and dispatches to the entity/tag
# states. Returns false at EOF to stop the tokenizer loop.
def data_state
  data = @stream.char

  # Track the last four characters so "<!--" / "-->" escapes can be
  # recognised inside CDATA/RCDATA.
  if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
    @lastFourChars << data
    @lastFourChars.shift if @lastFourChars.length > 4
  end

  if data == "&" and [:PCDATA, :RCDATA].include?(@content_model_flag) and !@escapeFlag
    @state = :entity_data_state
  elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
    # Entering an escaped section ("<!--" seen in CDATA/RCDATA).
    @escapeFlag = true
    @token_queue << {:type => :Characters, :data => data}
  elsif data == "<" and !@escapeFlag and
        [:PCDATA, :CDATA, :RCDATA].include?(@content_model_flag)
    @state = :tag_open_state
  elsif data == ">" and @escapeFlag and
        [:CDATA, :RCDATA].include?(@content_model_flag) and
        @lastFourChars[1..-1].join('') == "-->"
    # Leaving an escaped section ("-->" seen).
    @escapeFlag = false
    @token_queue << {:type => :Characters, :data => data}
  elsif data == :EOF
    # Tokenization ends.
    return false
  elsif SPACE_CHARACTERS.include? data
    # Whitespace directly after an emitted token matters, so space runs are
    # emitted as their own token type.
    @token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
  else
    @token_queue << {:type => :Characters, :data => data + @stream.chars_until(%w[& < > -])}
  end
  true
end
# Accumulates the DOCTYPE name character by character.
def doctype_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    @state = :after_doctype_name_state
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] += data
  end
  true
end
# Accumulates the DOCTYPE public identifier until the closing '"'.
def doctype_public_identifier_double_quoted_state
  data = @stream.char
  if data == "\""
    @state = :after_doctype_public_identifier_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += data
  end
  true
end
# Accumulates the DOCTYPE public identifier until the closing "'".
def doctype_public_identifier_single_quoted_state
  data = @stream.char
  if data == "'"
    @state = :after_doctype_public_identifier_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += data
  end
  true
end
# Right after "DOCTYPE": whitespace is required before the name.
def doctype_state
  data = @stream.char
  unless SPACE_CHARACTERS.include? data
    @token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
    @stream.unget(data)
  end
  # Either way the next state collects the name.
  @state = :before_doctype_name_state
  true
end
# Accumulates the DOCTYPE system identifier until the closing '"'.
def doctype_system_identifier_double_quoted_state
  data = @stream.char
  if data == "\""
    @state = :after_doctype_system_identifier_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += data
  end
  true
end
# Accumulates the DOCTYPE system identifier until the closing "'".
def doctype_system_identifier_single_quoted_state
  data = @stream.char
  if data == "'"
    @state = :after_doctype_system_identifier_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += data
  end
  true
end
This is where the magic happens.
We do our usually processing through the states and when we have a token to return we yield the token which pauses processing until the next token is requested.
# Drives the state machine, yielding each token (and any stream errors, as
# ParseError tokens) as it is produced. The loop stops when the current
# state method returns false, which happens at EOF.
def each
  @token_queue = []
  while send(@state)
    yield(:type => :ParseError, :data => @stream.errors.shift) until @stream.errors.empty?
    yield @token_queue.shift until @token_queue.empty?
  end
end
This method is a generic handler for emitting the tags. It also sets the state to "data" because that's what's needed after a token has been emitted.
# Queues the finished tag token (StartTag/EndTag/EmptyTag) and switches back
# to the data state. Tokens of any other type are silently left unqueued.
def emit_current_token
  token = @current_token
  if [:StartTag, :EndTag, :EmptyTag].include?(token[:type])
    token[:name] = token[:name].downcase if @lowercase_element_name
    @token_queue << token
    @state = :data_state
  end
end
# Resolves an entity in character data; an unrecognised "&" is emitted
# literally.
def entity_data_state
  entity = consume_entity
  @token_queue << {:type => :Characters, :data => entity || "&"}
  @state = :data_state
  true
end
# After "<!": decide between a comment ("--"), a DOCTYPE, or a bogus comment.
def markup_declaration_open_state
  char_stack = [@stream.char, @stream.char]
  if char_stack == ["-", "-"]
    @current_token = {:type => :Comment, :data => ""}
    @state = :comment_start_state
  else
    # Read five more characters to check for "DOCTYPE" (with an explicit
    # :EOF guard).
    5.times { char_stack.push(@stream.char) }
    if !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"
      @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
      @state = :doctype_state
    else
      @token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
      @stream.unget(char_stack)
      @state = :bogus_comment_state
    end
  end
  true
end
This method replaces the need for "entityInAttributeValueState".
# Consumes an entity inside an attribute value, appending either the
# resolved text or a literal "&" to the value currently being built.
def process_entity_in_attribute
  value = consume_entity
  @current_token[:data][-1][1] += (value || "&")
end
If the next character is a '>', convert the current_token into an EmptyTag
# After a "/" inside a tag: if the next character is ">", the current start
# tag becomes an EmptyTag; otherwise the solidus is a parse error. The
# peeked character is pushed back either way.
def process_solidus_in_tag
  data = @stream.char
  if @current_token[:type] == :StartTag and data == ">"
    @current_token[:type] = :EmptyTag
  else
    @token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
  end
  @stream.unget(data)
end
# Accumulates the tag name; whitespace moves on to attributes.
def tag_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    @state = :before_attribute_name_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
    emit_current_token
  elsif ASCII_LETTERS.include? data
    # Grab the whole run of letters at once.
    @current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
  elsif data == ">"
    emit_current_token
  elsif data == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    @current_token[:name] += data
  end
  true
end
# Just saw "<": decide between a declaration, an end tag, a start tag, or
# plain character data.
def tag_open_state
  data = @stream.char
  if @content_model_flag == :PCDATA
    if data == "!"
      @state = :markup_declaration_open_state
    elsif data == "/"
      @state = :close_tag_open_state
    elsif data != :EOF and ASCII_LETTERS.include? data
      @current_token = {:type => :StartTag, :name => data, :data => []}
      @state = :tag_name_state
    elsif data == ">"
      # XXX In theory it could be something besides a tag name. But
      # do we really care?
      @token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
      @token_queue << {:type => :Characters, :data => "<>"}
      @state = :data_state
    elsif data == "?"
      # XXX In theory it could be something besides a tag name. But
      # do we really care?
      @token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"}
      @stream.unget(data)
      @state = :bogus_comment_state
    else
      @token_queue << {:type => :ParseError, :data => "expected-tag-name"}
      @token_queue << {:type => :Characters, :data => "<"}
      @stream.unget(data)
      @state = :data_state
    end
  else
    # The content model is RCDATA or CDATA here (this state is never entered
    # with the PLAINTEXT flag), so only "</" can open a tag.
    if data == "/"
      @state = :close_tag_open_state
    else
      @token_queue << {:type => :Characters, :data => "<"}
      @stream.unget(data)
      @state = :data_state
    end
  end
  true
end