Class FeedParser::HTML2TextParser
In: lib/feedparser/html2text-parser.rb
Parent: SGMLParser
RuntimeError UnknownFeedTypeException FeedItem\n[lib/feedparser/feedparser.rb\nlib/feedparser/html-output.rb\nlib/feedparser/text-output.rb] AtomItem RSSItem SGMLParser HTML2TextParser Feed\n[lib/feedparser/feedparser.rb\nlib/feedparser/html-output.rb\nlib/feedparser/text-output.rb] lib/feedparser/feedparser.rb lib/feedparser/html-output.rb lib/feedparser/sgml-parser.rb lib/feedparser/html2text-parser.rb FeedParser dot/m_6_0.png

This class provides a simple SGML parser that removes HTML tags, leaving plain text.

Methods

Constants

# Maps an HTML entity name (the text between "&" and ";") to the Unicode
# code point it stands for. Keys are case-sensitive ("Prime" and "prime"
# are different entities). The pairs are listed in their original order.
HTML_ENTITIES = {
  "quot" => 34, "amp" => 38, "lt" => 60, "gt" => 62, "apos" => 39,
  "nbsp" => 160, "iexcl" => 161, "cent" => 162, "pound" => 163,
  "curren" => 164, "yen" => 165, "brvbar" => 166, "sect" => 167,
  "uml" => 168, "copy" => 169, "ordf" => 170, "laquo" => 171,
  "not" => 172, "shy" => 173, "reg" => 174, "macr" => 175,
  "deg" => 176, "plusmn" => 177, "sup2" => 178, "sup3" => 179,
  "acute" => 180, "micro" => 181, "para" => 182, "middot" => 183,
  "cedil" => 184, "sup1" => 185, "ordm" => 186, "raquo" => 187,
  "frac14" => 188, "frac12" => 189, "frac34" => 190, "iquest" => 191,
  "Agrave" => 192, "Aacute" => 193, "Acirc" => 194, "Atilde" => 195,
  "Auml" => 196, "Aring" => 197, "AElig" => 198, "Ccedil" => 199,
  "Egrave" => 200, "Eacute" => 201, "Ecirc" => 202, "Euml" => 203,
  "Igrave" => 204, "Iacute" => 205, "Icirc" => 206, "Iuml" => 207,
  "ETH" => 208, "Ntilde" => 209, "Ograve" => 210, "Oacute" => 211,
  "Ocirc" => 212, "Otilde" => 213, "Ouml" => 214, "times" => 215,
  "Oslash" => 216, "Ugrave" => 217, "Uacute" => 218, "Ucirc" => 219,
  "Uuml" => 220, "Yacute" => 221, "THORN" => 222, "szlig" => 223,
  "agrave" => 224, "aacute" => 225, "acirc" => 226, "atilde" => 227,
  "auml" => 228, "aring" => 229, "aelig" => 230, "ccedil" => 231,
  "egrave" => 232, "eacute" => 233, "ecirc" => 234, "euml" => 235,
  "igrave" => 236, "iacute" => 237, "icirc" => 238, "iuml" => 239,
  "eth" => 240, "ntilde" => 241, "ograve" => 242, "oacute" => 243,
  "ocirc" => 244, "otilde" => 245, "ouml" => 246, "divide" => 247,
  "oslash" => 248, "ugrave" => 249, "uacute" => 250, "ucirc" => 251,
  "uuml" => 252, "yacute" => 253, "thorn" => 254, "yuml" => 255,
  "fnof" => 402,
  "Alpha" => 913, "Beta" => 914, "Gamma" => 915, "Delta" => 916,
  "Epsilon" => 917, "Zeta" => 918, "Eta" => 919, "Theta" => 920,
  "Iota" => 921, "Kappa" => 922, "Lambda" => 923, "Mu" => 924,
  "Nu" => 925, "Xi" => 926, "Omicron" => 927, "Pi" => 928,
  "Rho" => 929, "Sigma" => 931, "Tau" => 932, "Upsilon" => 933,
  "Phi" => 934, "Chi" => 935, "Psi" => 936, "Omega" => 937,
  "alpha" => 945, "beta" => 946, "gamma" => 947, "delta" => 948,
  "epsilon" => 949, "zeta" => 950, "eta" => 951, "theta" => 952,
  "iota" => 953, "kappa" => 954, "lambda" => 955, "mu" => 956,
  "nu" => 957, "xi" => 958, "omicron" => 959, "pi" => 960,
  "rho" => 961, "sigmaf" => 962, "sigma" => 963, "tau" => 964,
  "upsilon" => 965, "phi" => 966, "chi" => 967, "psi" => 968,
  "omega" => 969, "thetasym" => 977, "upsih" => 978, "piv" => 982,
  "bull" => 8226, "hellip" => 8230, "prime" => 8242, "Prime" => 8243,
  "oline" => 8254, "frasl" => 8260, "weierp" => 8472, "image" => 8465,
  "real" => 8476, "trade" => 8482, "alefsym" => 8501,
  "larr" => 8592, "uarr" => 8593, "rarr" => 8594, "darr" => 8595,
  "harr" => 8596, "crarr" => 8629, "lArr" => 8656, "uArr" => 8657,
  "rArr" => 8658, "dArr" => 8659, "hArr" => 8660,
  "forall" => 8704, "part" => 8706, "exist" => 8707, "empty" => 8709,
  "nabla" => 8711, "isin" => 8712, "notin" => 8713, "ni" => 8715,
  "prod" => 8719, "sum" => 8721, "minus" => 8722, "lowast" => 8727,
  "radic" => 8730, "prop" => 8733, "infin" => 8734, "ang" => 8736,
  "and" => 8743, "or" => 8744, "cap" => 8745, "cup" => 8746,
  "int" => 8747, "there4" => 8756, "sim" => 8764, "cong" => 8773,
  "asymp" => 8776, "ne" => 8800, "equiv" => 8801, "le" => 8804,
  "ge" => 8805, "sub" => 8834, "sup" => 8835, "nsub" => 8836,
  "sube" => 8838, "supe" => 8839, "oplus" => 8853, "otimes" => 8855,
  "perp" => 8869, "sdot" => 8901,
  "lceil" => 8968, "rceil" => 8969, "lfloor" => 8970, "rfloor" => 8971,
  "lang" => 9001, "rang" => 9002, "loz" => 9674,
  "spades" => 9824, "clubs" => 9827, "hearts" => 9829, "diams" => 9830,
  "OElig" => 338, "oelig" => 339, "Scaron" => 352, "scaron" => 353,
  "Yuml" => 376, "circ" => 710, "tilde" => 732,
  "ensp" => 8194, "emsp" => 8195, "thinsp" => 8201, "zwnj" => 8204,
  "zwj" => 8205, "lrm" => 8206, "rlm" => 8207,
  "ndash" => 8211, "mdash" => 8212, "lsquo" => 8216, "rsquo" => 8217,
  "sbquo" => 8218, "ldquo" => 8220, "rdquo" => 8221, "bdquo" => 8222,
  "dagger" => 8224, "Dagger" => 8225, "permil" => 8240,
  "lsaquo" => 8249, "rsaquo" => 8250, "euro" => 8364
}

Attributes

savedata  [R] 

Public Class methods

[Source]

     # File lib/feedparser/html2text-parser.rb, line 138
     # Class-level accessor for the entity table. Hands back the
     # HTML_ENTITIES constant itself, not a copy.
     def self.entities
       HTML_ENTITIES
     end

[Source]

    # File lib/feedparser/html2text-parser.rb, line 9
    # Prepares an empty conversion state, then passes +verbose+ on to the
    # parent class initializer.
    def initialize(verbose = false)
      # Output buffer and whitespace mode.
      @savedata = ''
      @pre = false

      # Link bookkeeping: the href of the anchor currently open (if any)
      # and every link target collected so far.
      @href = nil
      @links = []

      # Image bookkeeping: collected [label, src] pairs and the label
      # seed that next_img_index advances before handing out a label.
      @imgs = []
      @img_index = '@'

      super(verbose)
    end

Public Instance methods

[Source]

     # File lib/feedparser/html2text-parser.rb, line 93
     # Finishes parsing (via the parent implementation) and then appends
     # the reference lists to the output buffer: first the numbered link
     # targets, then the labelled image sources. Each non-empty list is
     # preceded by a blank line.
     def close
       super

       unless @links.empty?
         @savedata << "\n\n"
         # Links are numbered from 1 in the order they were collected.
         @links.each_with_index do |target, position|
           @savedata << "[#{position + 1}] #{target}\n"
         end
       end

       return if @imgs.empty?

       @savedata << "\n\n"
       # Each image entry is a [label, source] pair.
       @imgs.each { |label, source| @savedata << "[#{label}] #{source}\n" }
     end

[Source]

    # File lib/feedparser/html2text-parser.rb, line 26
    # Appends a run of character data to the output buffer.
    #
    # Outside of a <pre> section every line feed becomes a space and runs
    # of spaces are collapsed to a single one. Only "\n" and " " are
    # affected; other whitespace is passed through untouched. Inside
    # <pre> the text is appended verbatim.
    #
    # The argument is never modified, so callers may pass frozen or
    # shared strings.
    #
    # data:: the text to append (String)
    #
    # Returns the output buffer.
    def handle_data(data)
      text = @pre ? data : data.tr("\n", ' ').squeeze(' ')
      @savedata << text
    end

[Source]

    # File lib/feedparser/html2text-parser.rb, line 19
    # Advances the image label and returns the new value.
    #
    # Starting from the '@' seed the sequence is "A", "B", ... "Z", "AA",
    # "AB", ... (String#succ). A fresh String is stored on every call, so
    # labels handed out earlier are not changed by later calls.
    def next_img_index
      @img_index = @img_index.succ
    end

[Source]

     # File lib/feedparser/html2text-parser.rb, line 134
     # Converts a numeric character reference into its UTF-8 character
     # and appends it through handle_data.
     #
     # ref:: the reference without the surrounding "&#" and ";". Decimal
     #       ("8217") is read with to_i; a leading "x" or "X" ("x2019")
     #       marks a hexadecimal value.
     def unknown_charref(ref)
       digits = ref.to_s
       codepoint = digits.start_with?('x', 'X') ? digits[1..-1].hex : digits.to_i
       handle_data([codepoint].pack('U*'))
     end

[Source]

     # File lib/feedparser/html2text-parser.rb, line 109
     # Emits the plain-text counterpart of a closing tag.
     #
     # tag:: name of the element being closed. Names not listed below
     #       produce no output.
     def unknown_endtag(tag)
       case tag
       when 'ul'                then @savedata << "\n"
       when 'b', 'strong', 'em' then @savedata << '*'
       when 'u'                 then @savedata << '_'
       when 'i'                 then @savedata << '/'
       when 'pre'
         @savedata << "\n\n"
         @pre = false
       when 'a'
         # Only anchors that carried an href get a reference marker. The
         # marker is the current size of the link list.
         if @href
           @savedata << "[#{@links.length}]"
           @href = nil
         end
       end
     end

[Source]

     # File lib/feedparser/html2text-parser.rb, line 401
     # Resolves a named entity reference and appends the result through
     # handle_data. Names present in HTML_ENTITIES become the matching
     # UTF-8 character; any other name is appended as the bare name.
     #
     # ref:: the entity name as received from the parser
     def unknown_entityref(ref)
       text = HTML_ENTITIES.key?(ref) ? [HTML_ENTITIES[ref]].pack('U*') : ref
       handle_data(text)
     end

[Source]

    # File lib/feedparser/html2text-parser.rb, line 35
    # Emits the plain-text counterpart of an opening tag.
    #
    # tag::   name of the element being opened
    # attrs:: list of [name, value] pairs for the element's attributes
    #
    # Block elements produce line breaks (headings add indentation),
    # inline emphasis produces a marker character, <a href> records its
    # target for the link list, and <img src> records its source and
    # writes a bracketed label. Any other tag is silently ignored.
    def unknown_starttag(tag, attrs)
      case tag
      when 'p', 'h4'           then @savedata << "\n\n"
      when 'h1'                then @savedata << "\n\n      "
      when 'h2'                then @savedata << "\n\n    "
      when 'h3'                then @savedata << "\n\n  "
      when 'br', 'ul'          then @savedata << "\n"
      when 'li'                then @savedata << "\n - "
      when 'b', 'strong', 'em' then @savedata << '*'
      when 'u'                 then @savedata << '_'
      when 'i'                 then @savedata << '/'
      when 'pre'
        @savedata << "\n\n"
        @pre = true
      when 'a'
        # When href appears more than once, the last occurrence wins.
        href_pair = attrs.select { |pair| pair[0] == 'href' }.last
        @href = href_pair && href_pair[1]
        # The stored target has surrounding quotes stripped; @href keeps
        # the raw value.
        @links << @href.gsub(/^("|'|)(.*)("|')$/,'\2') if @href
      when 'img'
        # When src appears more than once, the last occurrence wins.
        src_pair = attrs.select { |pair| pair[0] == 'src' }.last
        src = src_pair && src_pair[1]
        if src
          idx = next_img_index
          @imgs << [ idx, src.gsub(/^("|'|)(.*)("|')$/,'\2') ]
          @savedata << "[#{idx}]"
        end
      end
    end

[Validate]