Class | HTML5::HTMLInputStream |
In: |
lib/feed_tools/vendor/html5/lib/html5/inputstream.rb
|
Parent: | Object |
This class takes care of character encoding and removing or replacing incorrect byte-sequences and also provides column and line tracking.
char_encoding | [RW] | |
errors | [RW] | |
queue | [RW] |
Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source for use by the HTML5Lib.
source can be either a file-object, local filename or a string.
The optional encoding parameter must be a string that indicates the encoding. It is passed through the options hash. If specified (and recognised as a valid encoding), that encoding will be used, regardless of any BOM or later declaration (such as in a meta element).
parse_meta - When true (the default), look for a <meta> element containing encoding information. It is passed through the options hash.
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 29
#
# Builds the input stream: opens +source+, settles on a character encoding
# and primes the read buffer.
#
# source:: anything responding to +read+, or a string (see open_stream).
# options:: hash whose entries are copied verbatim into instance variables
#           of the same name (e.g. :encoding, :parse_meta, :chardet).
def initialize(source, options = {})
  # Defaults, each of which may be replaced by an entry in +options+.
  @encoding   = nil
  @parse_meta = true
  @chardet    = true
  options.each do |key, val|
    instance_variable_set("@#{key}", val)
  end

  # Raw Stream
  @raw_stream = open_stream(source)

  # Encoding Information
  # Bytes inspected when looking for a meta element with encoding information
  @NUM_BYTES_META    = 512
  # Bytes handed to chardet per read while guessing the encoding
  @NUM_BYTES_CHARDET = 256
  # Bytes fetched per read when consuming content
  @NUM_BYTES_BUFFER  = 1024
  # Encoding to use if no other information can be found
  @DEFAULT_ENCODING  = 'windows-1252'

  # Only a valid, explicitly supplied "transport level" encoding bypasses
  # detection.
  explicit = !@encoding.nil? && HTML5.is_valid_encoding(@encoding)
  @char_encoding = explicit ? @encoding : detect_encoding

  # Prime the buffer with the first block of the stream.
  @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
  case @char_encoding
  when 'windows-1252'
    @win1252 = true
  when 'utf-8'
    # nothing to convert
  else
    require 'iconv'
    begin
      # Any other encoding: slurp the rest of the stream and convert it to
      # utf-8 in one go.
      unless @raw_stream.eof?
        @buffer << @raw_stream.read
      end
      converted = Iconv.iconv('utf-8', @char_encoding, @buffer)
      @buffer = converted.first
    rescue
      # Conversion failed: fall back to windows-1252 handling.
      @win1252 = true
    end
  end

  @queue  = []
  @errors = []

  # Reset position in the list to read from
  @tell = 0
  @line = 0
  @col  = 0
  @line_lengths = []
end
Read one character from the stream or queue if available. Return EOF when EOF is reached.
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 252
#
# Returns +str+ tagged as raw bytes. Ruby 1.8 strings carry no encoding, so
# there the string is returned untouched.
def to_byte_string(str)
  return str unless str.respond_to?(:force_encoding)
  return str if str.encoding == Encoding::BINARY
  str.dup.force_encoding(Encoding::BINARY)
end
private :to_byte_string

# Read one character from the queue or, when the queue is empty, decode the
# next character from the stream.
#
# Returns a one-character string (utf-8 bytes), or the symbol :EOF once the
# buffer and the stream are exhausted. Invalid utf-8 bytes and NUL bytes are
# each replaced by U+FFFD; a NUL additionally records "null-character" in
# @errors. CR and CR LF are both returned as a single "\n".
#
# Every character decoded from the stream counts as one column, so that
# position, which steps back one column per queued character, stays in sync.
def char
  # Characters pushed back by callers are served first.
  return @queue.shift unless @queue.empty?

  # The decoder below indexes the buffer by byte. Ruby >= 1.9 indexes
  # strings by character unless they are binary, so normalise first.
  @buffer = to_byte_string(@buffer)

  # Keep at least four bytes available so that the longest utf-8 sequence
  # (or a CR LF pair) is never split across two buffer reads.
  if @tell + 4 > @buffer.length && !@raw_stream.eof?
    chunk = to_byte_string(@raw_stream.read(@NUM_BYTES_BUFFER).to_s)
    @buffer = to_byte_string(@buffer[@tell..-1].to_s + chunk)
    @tell = 0
  end

  # unpack('C') yields the byte value on every Ruby version, nil at EOF.
  c = @buffer[@tell, 1].to_s.unpack('C').first
  @tell += 1
  return :EOF if c.nil?

  result =
    case c
    when 0x00
      @errors.push("null-character")
      [0xFFFD].pack('U') # null characters are invalid

    when 0x01..0x7F
      if c == 0x0D
        # normalize newlines: swallow the LF of a CR LF pair
        @tell += 1 if @buffer[@tell, 1] == "\n"
        c = 0x0A
      end
      c.chr

    when 0x80..0xBF
      if !@win1252
        [0xFFFD].pack('U') # stray continuation byte: invalid utf-8
      elsif c <= 0x9F
        [ENTITIES_WINDOWS1252[c - 0x80]].pack('U')
      else
        [c].pack('U') # 0xA0..0xBF map 1:1 onto U+00A0..U+00BF
      end

    when 0xC0..0xFF
      if @win1252
        [c].pack('U') # 0xC0..0xFF map 1:1 onto U+00C0..U+00FF
      # from http://www.w3.org/International/questions/qa-forms-utf-8.en.php
      # \A (not ^) so a match can only start at the lead byte itself.
      elsif @buffer[@tell - 1, 4] =~ /\A
          ( [\xC2-\xDF][\x80-\xBF]               # non-overlong 2-byte
          | \xE0[\xA0-\xBF][\x80-\xBF]           # excluding overlongs
          | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}    # straight 3-byte
          | \xED[\x80-\x9F][\x80-\xBF]           # excluding surrogates
          | \xF0[\x90-\xBF][\x80-\xBF]{2}        # planes 1-3
          | [\xF1-\xF3][\x80-\xBF]{3}            # planes 4-15
          | \xF4[\x80-\x8F][\x80-\xBF]{2}        # plane 16
          )/nx
        sequence = Regexp.last_match(1)
        # The lead byte has already been consumed above.
        @tell += sequence.length - 1
        sequence.respond_to?(:force_encoding) ? sequence.force_encoding('UTF-8') : sequence
      else
        [0xFFFD].pack('U') # invalid utf-8
      end
    end

  # update position in stream
  if result == "\n"
    @line_lengths << @col
    @line += 1
    @col = 0
  else
    @col += 1
  end

  result
end
Returns a string of characters from the stream up to but not including any character in characters or EOF. characters can be any container that responds to include?. If opposite is true the test is inverted: characters are consumed for as long as they are contained in characters.
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 325
#
# Collects characters from the stream until a stop character or EOF.
#
# characters:: container responding to +include?+ (array, string, set, ...).
# opposite:: when falsy (default) reading stops at the first character that
#            IS in +characters+; when truthy it stops at the first character
#            that is NOT in +characters+.
#
# Returns the collected characters joined into one string. The character
# that ended the run is pushed back onto the front of the queue so the next
# read sees it again; :EOF is not pushed back.
def chars_until(characters, opposite=false)
  collected = []
  c = char

  # Comparing negations reduces both sides to strict booleans, so any
  # truthy +opposite+ behaves like true.
  while c != :EOF && (!characters.include?(c)) == !opposite
    collected << c
    c = char
  end

  # Put the character stopped on back to the front of the queue
  # from where it came.
  @queue.unshift(c) unless c == :EOF
  collected.join('')
end
Attempts to detect a BOM at the start of the stream. If an encoding can be determined from the BOM, return the name of the encoding; otherwise return nil.
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 147
#
# Looks for a byte order mark in the next four bytes of the raw stream.
#
# Returns the encoding name ('utf-8', 'utf-16le', 'utf-16be', 'utf-32le' or
# 'utf-32be') when a BOM is recognised, otherwise nil. Returns nil without
# repositioning when the stream yields nothing. In every other case the
# bytes are handed to +seek+ so that reading resumes just past the BOM, or
# at the start when there is none.
def detect_bom
  # Signatures are compared as byte values rather than strings: on
  # Ruby >= 1.9 a binary string read from the stream never equals a string
  # literal holding the same non-ASCII bytes in another encoding.
  # Order matters: UTF-32LE must be tried before its prefix UTF-16LE.
  signatures = [
    [[0xEF, 0xBB, 0xBF],       'utf-8'],
    [[0xFF, 0xFE, 0x00, 0x00], 'utf-32le'],
    [[0x00, 0x00, 0xFE, 0xFF], 'utf-32be'],
    [[0xFF, 0xFE],             'utf-16le'],
    [[0xFE, 0xFF],             'utf-16be']
  ]

  # Read in 4 bytes, the length of the longest BOM
  string = @raw_stream.read(4)
  return nil unless string

  bytes = string.unpack('C*')
  signature, encoding = signatures.find { |sig, _name| bytes[0, sig.length] == sig }

  # Set the read position past the BOM if one was found, otherwise
  # set it to the start of the stream
  seek(string, encoding ? signature.length : 0)

  encoding
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 94
#
# Works out which character encoding to use for the stream. The sources are
# tried in this order and the first one that yields a name wins:
#
# 1. a byte order mark (detect_bom, which also reads past the BOM),
# 2. a meta element, if @parse_meta is set (detect_encoding_meta),
# 3. the chardet gem, if @chardet is set and the gem can be loaded,
# 4. @DEFAULT_ENCODING.
#
# Returns the encoding name; 'iso-8859-1' (any case) is reported as
# 'windows-1252'.
def detect_encoding
  # First look for a BOM
  # This will also read past the BOM if present
  encoding = detect_bom

  # If there is no BOM need to look for meta elements with encoding
  # information
  encoding = detect_encoding_meta if encoding.nil? && @parse_meta

  # Guess with chardet, if available
  if encoding.nil? && @chardet
    begin
      require 'rubygems'
      require 'UniversalDetector' # gem install chardet
      buffers = []
      detector = UniversalDetector::Detector.instance
      detector.reset
      until @raw_stream.eof?
        buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
        break if buffer.nil? || buffer.empty?
        buffers << buffer
        detector.feed(buffer)
        break if detector.instance_eval { @done }
        # The detector may store its last byte as an integer; turn it into
        # a one-character string. Integer is used for the check because
        # Fixnum no longer exists on current Rubies.
        detector.instance_eval do
          @_mLastChar = @_mLastChar.chr if Integer === @_mLastChar
        end
      end
      detector.close
      encoding = detector.result['encoding']
      # Hand everything that was consumed back to the stream.
      seek(buffers.join(''), 0)
    rescue LoadError
      # chardet is not installed: fall through to the default encoding
    end
  end

  # If all else fails use the default encoding
  encoding = @DEFAULT_ENCODING if encoding.nil?

  # Substitute for equivalent encoding
  encoding = 'windows-1252' if 'iso-8859-1' == encoding.downcase

  encoding
end
Report the encoding declared by the meta element
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 228
#
# Reads the first @NUM_BYTES_META bytes of the raw stream, hands them to an
# EncodingParser and rewinds the stream to where it was (via +seek+).
#
# Returns whatever EncodingParser#get_encoding reports for those bytes.
def detect_encoding_meta
  head = @raw_stream.read(@NUM_BYTES_META)
  meta_parser = EncodingParser.new(head)
  # Give the inspected bytes back before asking the parser for its verdict.
  seek(head, 0)
  meta_parser.get_encoding
end
Produces a file object from source.
source can be either a file object, local filename or a string.
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 84
#
# Turns +source+ into an object that can be read from.
#
# Anything that already responds to +read+ is returned unchanged; any other
# value is treated as string content and wrapped in a StringIO.
def open_stream(source)
  # Already an IO like object
  return source if source.respond_to?(:read)

  # Treat source as a string and wrap in StringIO
  StringIO.new(source)
end
Returns (line, col) of the current position in the stream.
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 236
#
# Returns [line, col] of the current position in the stream, with the line
# number 1-based.
#
# @line and @col describe the position after the last character decoded
# from the stream. Characters waiting in @queue have been decoded but not
# yet consumed, so they are walked newest first and their effect on the
# position is undone.
#
# Raises RuntimeError when a queued newline is reached while the column is
# not 0, which means the bookkeeping is out of step.
def position
  line = @line
  col = @col

  @queue.reverse_each do |c|
    # :EOF never advanced the column when it was read, so there is nothing
    # to undo for it.
    next if c == :EOF

    if c == "\n"
      line -= 1
      raise RuntimeError, "col=#{col}" unless col == 0
      # Back onto the previous line: restore its recorded length.
      col = @line_lengths[line]
    else
      col -= 1
    end
  end

  [line + 1, col]
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 180 180: def seek(buffer, n) 181: if @raw_stream.respond_to?(:unget) 182: @raw_stream.unget(buffer[n..-1]) 183: return 184: end 185: 186: if @raw_stream.respond_to?(:seek) 187: begin 188: @raw_stream.seek(n) 189: return 190: rescue Errno::ESPIPE 191: end 192: end 193: 194: #TODO: huh? 195: require 'delegate' 196: @raw_stream = SimpleDelegator.new(@raw_stream) 197: 198: class << @raw_stream 199: def read(chars=-1) 200: if chars == -1 or chars > @data.length 201: result = @data 202: @data = '' 203: return result if __getobj__.eof? 204: return result + __getobj__.read if chars == -1 205: return result + __getobj__.read(chars-result.length) 206: elsif @data.empty? 207: return __getobj__.read(chars) 208: else 209: result = @data[1...chars] 210: @data = @data[chars..-1] 211: return result 212: end 213: end 214: 215: def unget(data) 216: if !@data or @data.empty? 217: @data = data 218: else 219: @data += data 220: end 221: end 222: end 223: 224: @raw_stream.unget(buffer[n .. -1]) 225: end