Module FeedTools::HtmlHelper
In: lib/feed_tools/helpers/html_helper.rb
ActiveRecord::Base DatabaseFeedCache StandardError FeedAccessError FeedItem Feed URI Cloud Link Author Image Enclosure TextInput Category lib/feed_tools/feed_item.rb lib/feed_tools/feed.rb lib/feed_tools.rb lib/feed_tools/vendor/uri.rb lib/feed_tools/database_feed_cache.rb lib/feed_tools/feed_structures.rb FeedToolsHelper FeedItemHelper DebugHelper HtmlHelper FeedHelper XmlHelper RetrievalHelper UriHelper GenericHelper FEED_TOOLS_VERSION FeedTools dot/m_79_0.png

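Methods for processing HTML and XHTML content: escaping and unescaping entities, stripping tags, tidying markup, resolving relative URIs, and extracting feed links.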

Methods

Constants

# Option names accepted by tidy_html; the keys of its options hash are
# validated against this list.
TIDY_OPTIONS = [
  :add_xml_decl, :add_xml_space, :alt_text, :assume_xml_procins, :bare,
  :clean, :css_prefix, :decorate_inferred_ul, :doctype, :drop_empty_paras,
  :drop_font_tags, :drop_proprietary_attributes, :enclose_block_text,
  :enclose_text, :escape_cdata, :fix_backslash, :fix_bad_comments, :fix_uri,
  :hide_comments, :hide_endtags, :indent_cdata, :input_xml, :join_classes,
  :join_styles, :literal_attributes, :logical_emphasis, :lower_literals,
  :merge_divs, :ncr, :new_blocklevel_tags, :new_empty_tags, :new_inline_tags,
  :new_pre_tags, :numeric_entities, :output_html, :output_xhtml, :output_xml,
  :preserve_entities, :quote_ampersand, :quote_marks, :quote_nbsp,
  :repeated_attributes, :replace_color, :show_body_only,
  :uppercase_attributes, :uppercase_tags, :word_2000, :accessibility_check,
  :show_errors, :show_warnings, :break_before_br, :indent,
  :indent_attributes, :indent_spaces, :markup, :punctuation_wrap, :split,
  :tab_size, :vertical_space, :wrap, :wrap_asp, :wrap_attributes, :wrap_jste,
  :wrap_php, :wrap_script_literals, :wrap_sections, :ascii_chars,
  :char_encoding, :input_encoding, :language, :newline, :output_bom,
  :output_encoding, :error_file, :force_output, :gnu_emacs, :gnu_emacs_file,
  :keep_time, :output_file, :quiet, :slide_style, :tidy_mark, :write_back
]

Public Class methods

Removes all html tags from the html formatted text, unescapes any escaped entities, and replaces typographic quotes with plain ASCII quotes.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 93
     #
     # Converts html formatted text to plain text: strips every tag,
     # unescapes entities and replaces typographic quotes with ASCII quotes.
     #
     # html - the html String to convert, or nil.
     #
     # Returns a new String, or nil when html is nil.  The argument itself is
     # never modified.
     def self.convert_html_to_plain_text(html)
       return nil if html.nil?
       # Work on a private copy: the helpers called below may edit the string
       # they are handed in place, which would otherwise leak to the caller.
       plain_text = html.dup
       plain_text = FeedTools::HtmlHelper.strip_html_tags(plain_text)
       plain_text = FeedTools::HtmlHelper.unescape_entities(plain_text)
       plain_text = plain_text.gsub(/‘/, "'")
       plain_text = plain_text.gsub(/’/, "'")
       plain_text = plain_text.gsub(/“/, "\"")
       plain_text = plain_text.gsub(/”/, "\"")
       return plain_text
     end

Escapes all html entities

[Source]

    # File lib/feed_tools/helpers/html_helper.rb, line 56
    #
    # Escapes all html entities in a string.
    #
    # html - the String to escape, or nil.
    #
    # Returns a new String with the markup-significant characters escaped by
    # CGI.escapeHTML and any remaining single or double quotes written as
    # &apos; and &quot;.  Returns nil when html is nil.
    def self.escape_entities(html)
      return nil if html.nil?
      escaped_html = CGI.escapeHTML(html)
      # Depending on the Ruby version, CGI.escapeHTML may already have
      # encoded the quote characters, in which case these are no-ops.
      escaped_html = escaped_html.gsub("'", "&apos;")
      escaped_html = escaped_html.gsub("\"", "&quot;")
      return escaped_html
    end

Given a block of html, locates feed links with a given mime type.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 563
     #
     # Given a block of html, locates a feed link with a given mime type.
     #
     # html      - the html document String to search.
     # mime_type - the mime type String the link's "type" attribute must
     #             match (compared case-insensitively).
     #
     # Returns the href of the first <link rel="alternate"> element with a
     # matching type and a non-blank href, or nil when there is none.
     def self.extract_link_by_mime_type(html, mime_type)
       require 'feed_tools/helpers/xml_helper'

       # HACK: Prevent the parser from freaking out if it sees this:
       # (gsub returns a copy, so the gsub! calls below never touch the
       # caller's string.)
       html = html.gsub(/<!'/, "&lt;!'")

       # This is technically very, very wrong.  But it saves oodles of
       # clock cycles, and probably works 99.999% of the time.
       html.gsub!(/<body.*?>(.|\n)*?<\/body>/, "<body></body>")
       html.gsub!(/<script.*?>(.|\n)*?<\/script>/, "")
       html.gsub!(/<noscript.*?>(.|\n)*?<\/noscript>/, "")
       html.gsub!(/<!--(.|\n)*?-->/, "")

       html = FeedTools::HtmlHelper.tidy_html(html)

       document = HTML5::HTMLParser.parse(html)

       root_node = document.root
       return nil if !root_node.respond_to?(:children)

       # Locate the <html> element: either the root itself or one of its
       # direct children.
       html_node = nil
       if root_node.name.downcase == "html" &&
           root_node.children.size > 0
         html_node = root_node
       else
         for node in root_node.children
           next unless node.kind_of?(REXML::Element)
           if node.name.downcase == "html" &&
               node.children.size > 0
             html_node = node
             break
           end
         end
       end

       # Collect candidate <link> elements.  Links found inside <head> take
       # precedence over links found directly under <html>.
       link_nodes = []
       if html_node != nil
         head_node = nil
         for node in html_node.children
           next unless node.kind_of?(REXML::Element)
           if node.name.downcase == "head"
             head_node = node
             break
           end
           if node.name.downcase == "link"
             link_nodes << node
           end
         end
         if head_node != nil
           link_nodes = []
           for node in head_node.children
             next unless node.kind_of?(REXML::Element)
             if node.name.downcase == "link"
               link_nodes << node
             end
           end
         end
       end

       # Searches the given nodes first, then recurses into their children.
       # A +return+ inside a lambda only leaves the lambda, so the result has
       # to be handed back explicitly at every level.
       find_href = lambda do |links|
         for link in links
           next unless link.kind_of?(REXML::Element)
           if link.attributes['type'].to_s.strip.downcase ==
               mime_type.downcase &&
               link.attributes['rel'].to_s.strip.downcase == "alternate"
             href = link.attributes['href']
             return href unless href.blank?
           end
         end
         for link in links
           next unless link.kind_of?(REXML::Element)
           nested_href = find_href.call(link.children)
           return nested_href unless nested_href.nil?
         end
         nil
       end
       return find_href.call(link_nodes)
     end

Returns a string containing normalized xhtml from within a REXML node.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 407
     #
     # Returns a string containing normalized xhtml from within a REXML node.
     #
     # rexml_node - the REXML node whose children should be serialized.
     #
     # The node is deep-cloned and only the clone is edited.  On the clone,
     # xhtml-namespaced elements and attributes lose their prefix and their
     # xmlns declaration, and unprefixed elements in any other non-atom
     # namespace are given an explicit prefix (a known one from
     # FEED_TOOLS_NAMESPACES, or a generated "unknown..." one).
     #
     # Returns the serialized children of the clone as a stripped String.
     def self.extract_xhtml(rexml_node)
       rexml_node_dup = rexml_node.deep_clone
       namespace_hash = FEED_TOOLS_NAMESPACES.dup
       normalize_namespaced_xhtml = lambda do |node, node_dup|
         if node.kind_of? REXML::Element
           node_namespace = node.namespace
           if node_namespace != namespace_hash['atom10'] &&
               node_namespace != namespace_hash['atom03']
             # Massive hack, relies on REXML not changing.
             # Both attribute lists are snapshotted before anything is
             # removed so that index N of the original always pairs with
             # index N of the clone.
             attributes = node.attributes.values
             attributes_dup = node_dup.attributes.values
             remove_xmlns = false
             attributes.each_with_index do |attribute, index|
               attribute_dup = attributes_dup[index]
               if attribute.namespace == namespace_hash['xhtml']
                 attribute_dup.instance_variable_set(
                   "@expanded_name", attribute.name)
               end
               if node_namespace == namespace_hash['xhtml'] &&
                   attribute.name == 'xmlns'
                 remove_xmlns = true
               end
             end
             node_dup.attributes.delete('xmlns') if remove_xmlns
             if node_namespace == namespace_hash['xhtml']
               node_dup.instance_variable_set("@expanded_name", node.name)
             end
             if !node_namespace.blank? && node.prefix.blank?
               if node_namespace != namespace_hash['xhtml']
                 prefix = nil
                 for known_prefix in namespace_hash.keys
                   if namespace_hash[known_prefix] == node_namespace
                     prefix = known_prefix
                   end
                 end
                 if prefix.nil?
                   # Derive a stable prefix from the namespace uri and
                   # remember it for later elements.
                   prefix = "unknown" +
                     Digest::SHA1.hexdigest(node_namespace)[0..4]
                   namespace_hash[prefix] = node_namespace
                 end
                 node_dup.instance_variable_set("@expanded_name",
                   "#{prefix}:#{node.name}")
                 node_dup.instance_variable_set("@prefix",
                   prefix)
                 node_dup.add_namespace(prefix, node_namespace)
               end
             end
           end
         end
         # Walk the original and the clone in lock step.
         for index in 0...node.children.size
           child = node.children[index]
           if child.kind_of? REXML::Element
             child_dup = node_dup.children[index]
             normalize_namespaced_xhtml.call(child, child_dup)
           end
         end
       end
       normalize_namespaced_xhtml.call(rexml_node, rexml_node_dup)
       buffer = ""
       rexml_node_dup.each_child do |child|
         if child.kind_of? REXML::Comment
           buffer << "<!--" + child.to_s + "-->"
         else
           buffer << child.to_s
         end
       end
       return buffer.strip
     end

Returns true if the type string provided indicates that something is html or xhtml content.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 310
     #
     # Tells whether a content type string denotes html or xhtml content.
     #
     # type - the type String to test (may be nil).
     #
     # Returns true for "html", "xhtml", "text/html" and
     # "application/xhtml+xml"; false for anything else.
     def self.html_type?(type)
       html_types = ["html", "xhtml", "text/html", "application/xhtml+xml"]
       return html_types.any? { |known_type| known_type == type }
     end

Indents a text selection by a specified number of spaces.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 256
     #
     # Indents a text selection by a specified number of spaces.
     #
     # text   - the String to indent, or nil.
     # spaces - the Integer number of spaces to put in front of every line.
     #
     # Returns a new String in which every line of text is prefixed with the
     # spaces and terminated by "\n", or nil when text is nil (like the other
     # text helpers in this module).
     def self.indent(text, spaces)
       return nil if text.nil?
       buffer = ""
       for line in text.split("\n")
         buffer << (" " * spaces) << line << "\n"
       end
       return buffer
     end

Returns true if the type string provided indicates that something is only html (not xhtml) content.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 321
     #
     # Tells whether a content type string denotes html (and not xhtml)
     # content.
     #
     # type - the type String to test (may be nil).
     #
     # Returns true for "html" and "text/html"; false for anything else.
     def self.only_html_type?(type)
       html_only_types = ["html", "text/html"]
       return html_only_types.any? { |known_type| known_type == type }
     end

Given a REXML node, returns its content, normalized as HTML.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 475
     #
     # Given a REXML node, returns its content, normalized as HTML.
     #
     # content_node     - the REXML node holding the text construct, or nil.
     # feed_type        - the feed type String; "atom" makes a missing type
     #                    attribute default to "text".
     # feed_version     - accepted for interface compatibility; not used by
     #                    this method.
     # base_uri_sources - extra base uris used when resolving relative uris.
     #
     # Returns the processed content String with relative uris resolved and
     # the markup tidied, or nil when the node is nil or the content is blank.
     def self.process_text_construct(content_node, feed_type, feed_version,
         base_uri_sources=[])
       return nil if content_node.nil?

       content = nil
       type = FeedTools::XmlHelper.try_xpaths(content_node, "@type",
         :select_result_value => true)
       mode = FeedTools::XmlHelper.try_xpaths(content_node, "@mode",
         :select_result_value => true)
       encoding = FeedTools::XmlHelper.try_xpaths(content_node, "@encoding",
         :select_result_value => true)

       # Atom content without an explicit type is treated as plain text.
       if type.nil?
         atom_namespaces = [
           FEED_TOOLS_NAMESPACES['atom10'],
           FEED_TOOLS_NAMESPACES['atom03']
         ]
         if ((atom_namespaces.include?(content_node.namespace) ||
             atom_namespaces.include?(content_node.root.namespace)) ||
             feed_type == "atom")
           type = "text"
         end
       end

       plain_text = type == "text" || mode == "text" ||
         type == "text/plain" || mode == "text/plain"

       # Note that we're checking for misuse of type, mode and encoding here
       if content_node.cdatas.size > 0
         content = content_node.cdatas.first.to_s.strip
       elsif type == "base64" || mode == "base64" ||
           encoding == "base64"
         content = Base64.decode64(content_node.inner_xml.strip)
       elsif type == "xhtml" || mode == "xhtml" ||
           type == "xml" || mode == "xml" ||
           type == "application/xhtml+xml" ||
           content_node.namespace == FEED_TOOLS_NAMESPACES['xhtml']
         content = FeedTools::HtmlHelper.extract_xhtml(content_node)
       else
         # Escaped html, plain text and unrecognized types are all stored
         # entity-escaped inside the node.
         content = FeedTools::HtmlHelper.unescape_entities(
           content_node.inner_xml.strip)
       end
       # Plain text is re-escaped so that it is safe to embed in html.
       if plain_text
         content = FeedTools::HtmlHelper.escape_entities(content)
       end
       unless content.nil?
         content = FeedTools::HtmlHelper.resolve_relative_uris(content,
           [content_node.base_uri] | base_uri_sources)
         content = FeedTools::HtmlHelper.tidy_html(content)
       end
       if FeedTools.configurations[:tab_spaces] != nil
         spaces = FeedTools.configurations[:tab_spaces].to_i
         content.gsub!("\t", " " * spaces) unless content.blank?
       end
       # String#strip returns a new string, so the result must be kept.
       content = content.strip unless content.blank?
       content = nil if content.blank?
       return content
     end

Resolves all relative uris in a block of html.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 329
     #
     # Resolves all relative uris in a block of html.
     #
     # html             - the html String to process, or nil.
     # base_uri_sources - extra base uris tried after each node's own
     #                    base_uri.
     #
     # The html is parsed as a fragment (through the sanitizing tokenizer
     # when the :sanitization_enabled configuration is set), every uri-valued
     # attribute listed below is resolved and normalized, and the fragment is
     # serialized again.
     #
     # Returns the re-serialized html String, or nil when html is nil.  The
     # argument itself is never modified.
     def self.resolve_relative_uris(html, base_uri_sources=[])
       return nil if html.nil?
       relative_uri_attributes = [
         ["a", "href"],
         ["applet", "codebase"],
         ["area", "href"],
         ["blockquote", "cite"],
         ["body", "background"],
         ["del", "cite"],
         ["form", "action"],
         ["frame", "longdesc"],
         ["frame", "src"],
         ["iframe", "longdesc"],
         ["iframe", "src"],
         ["head", "profile"],
         ["img", "longdesc"],
         ["img", "src"],
         ["img", "usemap"],
         ["input", "src"],
         ["input", "usemap"],
         ["ins", "cite"],
         ["link", "href"],
         ["object", "classid"],
         ["object", "codebase"],
         ["object", "data"],
         ["object", "usemap"],
         ["q", "cite"],
         ["script", "src"]
       ]

       # HACK: Prevent the parser from freaking out if it sees this:
       # (non-destructive gsub, so the caller's string is left alone)
       html = html.gsub(/<!'/, "&lt;!'")

       if FeedTools.configurations[:sanitization_enabled]
         fragments = HTML5::HTMLParser.parse_fragment(
           html, :tokenizer => HTML5::HTMLSanitizer, :encoding => 'UTF-8')
       else
         fragments = HTML5::HTMLParser.parse_fragment(html)
       end
       resolve_node = lambda do |html_node|
         if html_node.kind_of? REXML::Element
           for element_name, attribute_name in relative_uri_attributes
             next unless html_node.name.downcase == element_name
             attribute = html_node.attribute(attribute_name)
             next if attribute.nil?
             href = FeedTools::UriHelper.resolve_relative_uri(
               attribute.value, [html_node.base_uri] | base_uri_sources)
             href = FeedTools::UriHelper.normalize_url(href)
             # The value is written straight into the attribute's instance
             # variables rather than through a public setter.
             attribute.instance_variable_set("@value", href)
             attribute.instance_variable_set("@unnormalized", href)
             attribute.instance_variable_set("@normalized", href)
             if html_node.attribute(attribute_name).value != href
               warn("Failed to update href to resolved value.")
             end
           end
         end
         if html_node.respond_to? :children
           for child in html_node.children
             resolve_node.call(child)
           end
         end
         html_node
       end
       fragments.each do |fragment|
         resolve_node.call(fragment)
       end
       return fragments.map { |fragment| fragment.to_s }.join("")
     end

Removes all html tags from the html formatted text, but leaves escaped entities alone.

[Source]

    # File lib/feed_tools/helpers/html_helper.rb, line 84
    #
    # Removes all html tags from the html formatted text, but leaves escaped
    # entities alone.
    #
    # html - the html String to strip, or nil.
    #
    # Returns a new String without tags, or nil when html is nil.  The
    # argument is not modified, so frozen strings are accepted too.
    def self.strip_html_tags(html)
      return nil if html.nil?
      return html.gsub(/<\/?[^>]+>/, "")
    end

Strips semantically empty div wrapper elements

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 545
     #
     # Strips semantically empty div wrapper elements.
     #
     # xhtml - the xhtml String (or object responding to to_s), or nil.
     #
     # Returns the stripped inner xml of the wrapper when the document
     # consists of exactly one <div> element.  Otherwise, or when the input
     # cannot be processed, returns the stripped input.  Returns nil for
     # nil, and blank input unchanged.
     def self.strip_wrapper_element(xhtml)
       return nil if xhtml.nil?
       return xhtml if xhtml.blank?
       begin
         doc = REXML::Document.new(xhtml.to_s.strip)
         if doc.children.size == 1
           child = doc.children[0]
           if child.kind_of?(REXML::Element) && child.name.downcase == "div"
             return child.inner_xml.strip
           end
         end
         return xhtml.to_s.strip
       rescue StandardError
         # Best effort: hand back the input when it cannot be processed.
         # Only StandardError is caught so that interrupts and exit
         # requests still propagate.
         return xhtml.to_s.strip
       end
     end

Returns true if the type string provided indicates that something is plain text content.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 301
     #
     # Tells whether a content type string denotes plain text content.
     #
     # type - the type String to test (may be nil).
     #
     # Returns true for "text" and "text/plain"; false for anything else.
     def self.text_type?(type)
       text_types = ["text", "text/plain"]
       return text_types.any? { |known_type| known_type == type }
     end

Returns true if the html tidy module can be used.

Obviously, you need the tidy gem installed in order to run with html tidy features turned on.

This method does a fairly complicated, and probably unnecessarily desperate, search for the libtidy library. If you want this thing to execute fast, the best thing to do is to set Tidy.path ahead of time. If Tidy.path is set, this method doesn't do much. If it's not set, it will do its darnedest to find the libtidy library. If you set the LIBTIDYPATH environment variable to the location of the libtidy library, it should be able to find it.

Once the library is located, this method will run much faster.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 119
     #
     # Returns true if the html tidy module can be used.
     #
     # Returns false right away when the :tidy_enabled configuration is
     # false.  Otherwise tries to require the tidy library and, unless
     # Tidy.path is already set, searches for libtidy: first in the
     # LIBTIDYPATH environment variable (when set), then in a list of
     # well-known locations.  A positive result is cached in @tidy_enabled;
     # a negative result is re-checked on every call.
     def self.tidy_enabled?
       # This is an override variable to keep tidy from being used even if it
       # is available.
       if FeedTools.configurations[:tidy_enabled] == false
         return false
       end
       if @tidy_enabled.nil? || @tidy_enabled == false
         @tidy_enabled = false
         begin
           require 'tidy'
           if Tidy.path.nil?
             # *Shrug*, just brute force it, I guess.  There's a lot of places
             # this thing might be hiding in, depending on platform and general
             # sanity of the person who installed the thing.  Most of these are
             # probably unlikely, but it's not like checking unlikely locations
             # hurts.  Much.  Especially if you actually find it.
             libtidy_locations = [
               '/usr/local/lib/libtidy.dylib',
               '/opt/local/lib/libtidy.dylib',
               '/usr/lib/libtidy.dylib',
               '/usr/local/lib/tidylib.dylib',
               '/opt/local/lib/tidylib.dylib',
               '/usr/lib/tidylib.dylib',
               '/usr/local/lib/tidy.dylib',
               '/opt/local/lib/tidy.dylib',
               '/usr/lib/tidy.dylib',
               '/usr/local/lib/libtidy.so',
               '/opt/local/lib/libtidy.so',
               '/usr/lib/libtidy.so',
               '/usr/local/lib/tidylib.so',
               '/opt/local/lib/tidylib.so',
               '/usr/lib/tidylib.so',
               '/usr/local/lib/tidy.so',
               '/opt/local/lib/tidy.so',
               '/usr/lib/tidy.so',
               'C:\Program Files\Tidy\tidy.dll',
               'C:\Tidy\tidy.dll',
               'C:\Ruby\bin\tidy.dll',
               'C:\Ruby\tidy.dll',
               '/usr/local/lib',
               '/opt/local/lib',
               '/usr/lib'
             ]
             # We just made this thing up, but if someone sets it, we'll
             # go ahead and check it before anything else.
             unless ENV['LIBTIDYPATH'].nil?
               libtidy_locations.unshift(ENV['LIBTIDYPATH'])
             end
             for path in libtidy_locations
               next unless File.exist?(path)
               path_type = File.ftype(path)
               if path_type == "file" || path_type == "link"
                 Tidy.path = path
                 @tidy_enabled = true
                 break
               elsif path_type == "directory"
                 # Ok, now perhaps we're getting a bit more desperate.
                 # Search the directory tree without going through a shell,
                 # so an unusual path cannot be interpreted as a command.
                 # If there's more than one match, grab the first one and
                 # hope for the best, and if it doesn't work, then blame the
                 # user for not specifying more accurately.
                 tidy_path =
                   Dir.glob(File.join(path, "**", "*tidy*")).find do |found|
                     found =~ /\.(so|dylib)\z/
                   end
                 unless tidy_path.nil?
                   Tidy.path = tidy_path
                   @tidy_enabled = true
                   break
                 end
               end
             end
           else
             @tidy_enabled = true
           end
         rescue LoadError
           # Tidy not installed, disable features that rely on tidy.
           @tidy_enabled = false
         end
       end
       return @tidy_enabled
     end

Tidies up the html.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 206
     #
     # Tidies up the html.
     #
     # html    - the html String to clean, or nil.
     # options - a Hash of tidy options; its keys are validated against
     #           TIDY_OPTIONS and its values override the defaults below.
     #
     # When tidy is not enabled the html is returned as given.  When tidy
     # produces a blank result for non-blank input, the stripped input is
     # returned instead.
     #
     # Returns the tidied html String, or nil when html is nil.  The
     # argument itself is never modified.
     def self.tidy_html(html, options = {})
       return nil if html.nil?
       FeedTools::GenericHelper.validate_options(TIDY_OPTIONS, options.keys)

       options = {
         :add_xml_decl => false,
         :char_encoding => "utf8",
         :doctype => "omit",
         :indent => false,
         :logical_emphasis => true,
         :markup => true,
         :show_warnings => false,
         :wrap => 0
       }.merge(options)

       if FeedTools::HtmlHelper.tidy_enabled?
         # Non-destructive gsub, so the caller's string is left alone.
         html = html.gsub(/&lt;!'/, "&amp;lt;!'")

         # Anything that looks like a complete document or carries an xml
         # declaration is tidied as a whole; everything else is treated as
         # a fragment.
         stripped_html = html.strip
         is_fragment = true
         if (stripped_html =~ /<html>(.|\n)*<body>/) != nil ||
             (stripped_html =~ /<\/body>(.|\n)*<\/html>$/) != nil
           is_fragment = false
         end
         if (stripped_html =~ /<\?xml(.|\n)*\?>/) != nil
           is_fragment = false
         end

         options[:show_body_only] = true if is_fragment

         # Tidy sucks?
         # TODO: find the correct set of tidy options to set so
         # that *ugly* hacks like this aren't necessary.
         html = html.gsub(/\302\240/, "\240")

         tidied_html = Tidy.open(options) do |tidy|
           tidy.clean(html)
         end
         # Guard against a nil result; the fallback below covers that case.
         tidied_html = tidied_html.strip unless tidied_html.nil?
       else
         tidied_html = html
       end

       if tidied_html.blank? && !html.blank?
         tidied_html = html.strip
       end

       return tidied_html
     end

Unescapes all html entities

[Source]

    # File lib/feed_tools/helpers/html_helper.rb, line 65
    #
    # Unescapes all html entities.
    #
    # html - the String to unescape, or nil.
    #
    # Numeric entities are decoded to UTF-8 first, then the named entities
    # are decoded by CGI.unescapeHTML, then &apos; and &quot;.  A numeric
    # entity whose value cannot be encoded is left in the text instead of
    # raising.
    #
    # Returns a new String, or nil when html is nil.  The argument itself is
    # never modified.
    def self.unescape_entities(html)
      return nil if html.nil?
      # A numerically escaped ampersand is first rewritten to &amp; so that
      # text such as "&#38;lt;" is unescaped once ("&lt;"), not twice ("<").
      # The non-destructive gsub also gives us a private copy to work on.
      unescaped_html = html.gsub(/&#x26;/, "&amp;")
      unescaped_html.gsub!(/&#38;/, "&amp;")
      unescaped_html.gsub!(/&#0*((?:\d+)|(?:x[a-f0-9]+));/) do |entity|
        code = $1
        # Integer() needs the "0x" form for hexadecimal values.
        code = "0#{code}" if code[0, 1] == "x"
        begin
          [Integer(code)].pack('U*')
        rescue RangeError
          entity
        end
      end
      unescaped_html = CGI.unescapeHTML(unescaped_html)
      unescaped_html.gsub!(/&apos;/, "'")
      unescaped_html.gsub!(/&quot;/, "\"")
      return unescaped_html
    end

Unindents a text selection by a specified number of spaces.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 267
     #
     # Unindents a text selection by a specified number of spaces.
     #
     # text   - the String to unindent, or nil.
     # spaces - the maximum number of leading spaces to remove per line.
     #
     # Only leading space characters are removed; a line with fewer leading
     # spaces than requested simply loses all of them.
     #
     # Returns a new String with every line terminated by "\n", or nil when
     # text is nil (like the other text helpers in this module).
     def self.unindent(text, spaces)
       return nil if text.nil?
       buffer = ""
       for line in text.split("\n")
         # Count how many leading spaces may be dropped from this line.
         removable = 0
         while removable < spaces && line[removable, 1] == " "
           removable += 1
         end
         buffer << line[removable..-1] << "\n"
       end
       return buffer
     end

Returns true if the type string provided indicates that something is xml or xhtml content.

[Source]

     # File lib/feed_tools/helpers/html_helper.rb, line 285
     #
     # Tells whether a content type string denotes xml or xhtml content.
     #
     # type - the type String to test (may be nil).
     #
     # Returns true for "xml", "xhtml", "application/xhtml+xml" and for any
     # other type whose last three characters are "xml"; false otherwise.
     def self.xml_type?(type)
       known_xml_types = ["xml", "xhtml", "application/xhtml+xml"]
       return true if known_xml_types.include?(type)
       return false if type == nil
       # Catches types such as "application/atom+xml".
       return type[-3..-1] == "xml"
     end

[Validate]