Class HTML5::EncodingParser
In: lib/feed_tools/vendor/html5/lib/html5/inputstream.rb
Parent: Object
Phase XmlElementPhase InTablePhase RootElementPhase AfterHeadPhase InHeadPhase AfterFramesetPhase XmlRootPhase InitialPhase InFramesetPhase InColumnGroupPhase InTableBodyPhase InCaptionPhase BeforeHeadPhase TrailingEndPhase InSelectPhase InCellPhase AfterBodyPhase InBodyPhase InRowPhase Exception SerializeError EOF AssertionError ParseError HTMLSanitizer HTMLTokenizer XhmlRootPhase String EncodingBytes XMLParser XHTMLParser HTMLParser HTMLSerializer XHTMLSerializer TreeWalkers::Base NonRecursiveTreeWalker TreeWalker TreeWalker Base TreeWalker Element DocumentFragment Node CommentNode DocumentType TextNode Document Base::Node Node Node Base::TreeBuilder TreeBuilder TreeBuilder TreeBuilder Element DocumentFragment CommentNode DocumentType TextNode Document Element DocumentFragment CommentNode DocumentType TextNode Document Base OptionalTagFilter InjectMetaCharset WhitespaceFilter HTMLSanitizeFilter HTMLSanitizeModule Enumerable TestData SimpleDelegator HTMLInputStream EncodingParser ContentAttrParser Node TreeBuilder lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb lib/feed_tools/vendor/html5/lib/html5/constants.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb 
lib/feed_tools/vendor/html5/lib/html5/inputstream.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb Hpricot TokenConstructor lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb REXML lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb SimpleTree TreeWalkers HTMLSanitizeModule lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb Hpricot lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb Base lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb REXML lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb SimpleTree TreeBuilders lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb lib/feed_tools/vendor/html5/lib/html5/filters/base.rb lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb Filters Sniffer lib/feed_tools/vendor/html5/tests/preamble.rb TestSupport HTML5 dot/m_75_0.png

Mini parser for detecting character encoding from meta elements

Methods

Public Class methods

data - the data to work on for encoding detection

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 412
# Set up the parser over +data+.
#
# data - the input to scan; it is converted with to_s and wrapped in an
#        EncodingBytes. No encoding is known yet, so @encoding starts nil.
def initialize(data)
  raw = data.to_s
  @data = EncodingBytes.new(raw)
  @encoding = nil
end

Public Instance methods

Return a [name, value] pair for the next attribute in the stream if one is found, or nil otherwise

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 514
# Return a [name, value] pair for the next attribute in the stream, or nil
# when the current byte (after skipping whitespace and '/') is '<' or '>'.
#
# ASCII uppercase bytes in both the name and the value are lowercased. An
# attribute that has no value is returned with '' as its value.
def get_attribute
  # Lowercase a single byte if it is an ASCII uppercase letter
  lower = lambda { |b| ASCII_UPPERCASE.include?(b) ? b.downcase : b }

  # Skip whitespace and '/' in front of the attribute
  @data.skip(SPACE_CHARACTERS + ['/'])

  if @data.current_byte == '<'
    # Step back one byte -- presumably so the '<' is seen again by the
    # caller's scan (TODO confirm against EncodingBytes#each)
    @data.position -= 1
    return nil
  elsif @data.current_byte == '>'
    return nil
  end

  attr_name = []
  attr_value = []
  space_found = false
  #Step 5 attribute name
  while true
    byte = @data.current_byte
    # An '=' only terminates the name once at least one byte has been
    # collected; a leading '=' is part of the name. An empty Array is
    # truthy in Ruby, so emptiness must be tested explicitly.
    if byte == '=' && !attr_name.empty?
      break
    elsif SPACE_CHARACTERS.include?(byte)
      space_found = true
      break
    elsif ['/', '<', '>'].include?(byte)
      return [attr_name.join(''), '']
    else
      attr_name.push(lower.call(byte))
    end
    #Step 6
    @data.position += 1
  end
  #Step 7
  if space_found
    @data.skip
    #Step 8: the name was followed by spaces but no '=', so it has no value
    unless @data.current_byte == '='
      @data.position -= 1
      return [attr_name.join(''), '']
    end
  end
  #XXX need to advance position in both spaces and value case
  #Step 9: move past the '='
  @data.position += 1
  #Step 10
  @data.skip
  #Step 11
  first = @data.current_byte
  if ["'", '"'].include?(first)
    #11.1 quoted value: collect bytes up to the matching quote
    quote_char = first
    while true
      @data.position += 1
      byte = @data.current_byte
      #11.3
      if byte == quote_char
        @data.position += 1
        return [attr_name.join(''), attr_value.join('')]
      end
      #11.4 / 11.5
      attr_value.push(lower.call(byte))
    end
  elsif ['>', '<'].include?(first)
    return [attr_name.join(''), '']
  else
    attr_value.push(lower.call(first))
  end
  # Unquoted value: collect bytes until whitespace, '>' or '<'
  while true
    @data.position += 1
    byte = @data.current_byte
    if (SPACE_CHARACTERS + ['>', '<']).include?(byte)
      return [attr_name.join(''), attr_value.join('')]
    end
    attr_value.push(lower.call(byte))
  end
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 426
# Scan @data for an encoding declaration.
#
# At each step of @data.each, the first entry of @@method_dispatch whose
# key matches (via @data.match_bytes(key, true)) has its handler method
# invoked. Scanning stops as soon as a handler returns a falsy value.
#
# Returns @encoding with surrounding whitespace stripped, or nil if no
# encoding was recorded.
def get_encoding
  @data.each do |_byte|
    _key, handler = @@method_dispatch.find { |key, _| @data.match_bytes(key, true) }
    # Nothing matched at this position: keep scanning
    next if handler.nil?
    break unless send(handler)
  end
  @encoding = @encoding.strip if @encoding
  @encoding
end

Skip over comments

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 442
# Skip over a comment by jumping @data to the closing '-->'.
# The result of @data.jump_to is passed straight back to the caller.
def handle_comment
  @data.jump_to('-->')
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 446
# Inspect the attributes of a <meta element for an encoding declaration.
#
# Returns true when scanning should continue (the element is not a real
# meta tag, or its attributes ran out without a valid encoding), and
# false once a valid encoding has been stored in @encoding.
def handle_meta
  # "<meta" not followed by a space is not a meta element; keep going
  return true unless SPACE_CHARACTERS.include?(@data.current_byte)

  # We have a valid meta element; walk its attributes one at a time
  while true
    pair = get_attribute
    return true if pair.nil?

    name, value = pair
    if name == 'charset'
      candidate = value
    elsif name == 'content'
      # The encoding is embedded in the content attribute's value
      candidate = ContentAttrParser.new(EncodingBytes.new(value)).parse
    else
      # Any other attribute is irrelevant here
      next
    end

    next unless HTML5.is_valid_encoding(candidate)

    @encoding = candidate
    return false
  end
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 508
# Fallback handler: jump @data to the next '>' and hand back whatever
# @data.jump_to returns.
def handle_other
  @data.jump_to('>')
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 478
# Handle a possible end tag: advance @data one position, then process
# the rest as a tag with the end_tag flag set.
def handle_possible_end_tag
  @data.position += 1
  handle_possible_tag(true)
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 474
# Handle a possible start tag by delegating to handle_possible_tag with
# the end_tag flag cleared.
def handle_possible_start_tag
  handle_possible_tag(false)
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 483
# Process what may be a start or end tag at the current position.
#
# end_tag - true when called for a possible end tag, false for a
#           possible start tag.
#
# Always returns true.
def handle_possible_tag(end_tag)
  if ASCII_LETTERS.include?(@data.current_byte)
    @data.find_next(SPACE_CHARACTERS + ['<', '>'])

    if @data.current_byte == '<'
      # Return to the first step in the overall "two step" algorithm,
      # reprocessing the < byte
      @data.position -= 1
    else
      # Read (and discard) all attributes
      nil until get_attribute.nil?
    end
  elsif end_tag
    # The next byte is not an ASCII letter: for a possible end tag, step
    # back and treat the fragment according to handle_other. A possible
    # start tag is simply ignored.
    @data.position -= 1
    handle_other
  end
  true
end

[Validate]