Class | ActiveSupport::Multibyte::Handlers::UTF8Handler |
In: |
vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
|
Parent: | Object |
UTF8Handler implements Unicode-aware operations for strings. These operations are used by the Chars proxy when $KCODE is set to 'UTF8'.
HANGUL_SBASE | = | 0xAC00 | Hangul character boundaries and properties | |
HANGUL_LBASE | = | 0x1100 | ||
HANGUL_VBASE | = | 0x1161 | ||
HANGUL_TBASE | = | 0x11A7 | ||
HANGUL_LCOUNT | = | 19 | ||
HANGUL_VCOUNT | = | 21 | ||
HANGUL_TCOUNT | = | 28 | ||
HANGUL_NCOUNT | = | HANGUL_VCOUNT * HANGUL_TCOUNT | ||
HANGUL_SCOUNT | = | 11172 | ||
HANGUL_SLAST | = | HANGUL_SBASE + HANGUL_SCOUNT | ||
HANGUL_JAMO_FIRST | = | 0x1100 | ||
HANGUL_JAMO_LAST | = | 0x11FF | ||
UNICODE_WHITESPACE | = | [ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D> 0x0020, # White_Space # Zs SPACE 0x0085, # White_Space # Cc <control-0085> 0x00A0, # White_Space # Zs NO-BREAK SPACE 0x1680, # White_Space # Zs OGHAM SPACE MARK 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE 0x2028, # White_Space # Zl LINE SEPARATOR 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE ].flatten.freeze | All the unicode whitespace | |
UNICODE_LEADERS_AND_TRAILERS | = | UNICODE_WHITESPACE + [65279] | The BOM (byte order mark) can also be seen as whitespace; it's a non-rendering character used to distinguish between little and big endian. This is not an issue in UTF-8, so it must be ignored. | |
UTF8_PAT | = | ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'] | ||
UNICODE_TRAILERS_PAT | = | /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/ | ||
UNICODE_LEADERS_PAT | = | /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/ | ||
UCD | = | UnicodeDatabase.new | Unicode database |
size | -> | length |
slice | -> | [] |
Works just like the indexed replace method on string, except instead of byte offsets you specify character offsets.
Example:
s = "Müller" s.chars[2] = "e" # Replace character with offset 2 s # => "Müeler" s = "Müller" s.chars[1, 2] = "ö" # Replace 2 characters at character offset 1 s # => "Möler"
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 147
#
# Indexed replace that addresses characters (codepoints) instead of bytes.
#
# The last element of +args+ is the replacement; the preceding elements
# select what is replaced:
# * a Regexp (optionally with a capture index) - delegated to String#[]=
# * an Integer offset, optionally followed by a length
# * a Range of character offsets
# * anything else is converted with +to_s+ and treated as a substring
#
# An Integer replacement is treated as a single codepoint when the selector
# is an Integer offset.
#
# Mutates +str+ in place and returns it.
#
# Raises IndexError when an Integer offset is past the end of the string or
# when a substring selector does not occur in +str+, and RangeError when a
# Range starts past the end of the string.
def []=(str, *args)
  replace_by = args.pop
  # Indexed replace with regular expressions already works
  return str[*args] = replace_by if args.first.is_a?(Regexp)
  result = u_unpack(str)
  # Integer (rather than Fixnum) also covers very large offsets and keeps
  # working on Rubies where Fixnum no longer exists.
  if args[0].is_a?(Integer)
    raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length
    min = args[0]
    max = args[1].nil? ? min : (min + args[1] - 1)
    range = Range.new(min, max)
    replace_by = [replace_by].pack('U') if replace_by.is_a?(Integer)
  elsif args.first.is_a?(Range)
    # Range#begin instead of Range#min: min returns nil for an empty range
    # such as (5..2), which would fail the comparison with NoMethodError.
    raise RangeError, "#{args[0]} out of range" if args[0].begin >= result.length
    range = args[0]
  else
    needle = args[0].to_s
    min = index(str, needle)
    # Behave like the native String#[]= when the substring is absent.
    raise IndexError, "string not matched" if min.nil?
    max = min + length(needle) - 1
    range = Range.new(min, max)
  end
  result[range] = u_unpack(replace_by)
  str.replace(result.pack('U*'))
end
Returns a copy of str with the first character converted to uppercase and the remainder to lowercase
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 265
#
# Returns a new string in which the first character of +str+ is upcased and
# every following character is downcased. The tail falls back to '' when
# slicing from offset 1 yields nil (for example on an empty string).
def capitalize(str)
  head = slice(str, 0..0)
  tail = slice(str, 1..-1) || ''
  upcase(head) + downcase(tail)
end
Works just like String#center, only integer specifies characters instead of bytes.
Example:
"¾ cup".chars.center(8).to_s # => " ¾ cup " "¾ cup".chars.center(8, " ").to_s # Use non-breaking whitespace # => " ¾ cup "
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 206
#
# Character-aware counterpart of String#center: +integer+ is the target
# width in characters and +padstr+ is the padding used on both sides.
# The work is done by +justify+ with the :center strategy.
def center(str, integer, padstr=' ')
  way = :center
  justify(str, integer, way, padstr)
end
Perform composition on the characters in the string
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 303
#
# Performs composition on the characters in +str+ and returns the result as
# a new UTF-8 string.
#
# The string is unpacked into codepoints, the codepoints are composed, and
# the composed codepoints are packed back into a string. The parentheses
# matter: without them +pack+ binds to the unpacked array and a String is
# handed to +compose_codepoints+, which expects an array of codepoints.
def compose(str)
  compose_codepoints(u_unpack(str)).pack('U*')
end
Checks if the string is valid UTF8.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 333
#
# Checks whether +str+ can be decoded as UTF-8.
#
# Returns true when String#unpack with the 'U*' directive succeeds and false
# when it raises ArgumentError. Any other exception propagates.
def consumes?(str)
  # Unpack is a little bit faster than regular expressions
  str.unpack('U*')
  true
rescue ArgumentError
  false
end
Perform decomposition on the characters in the string
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 298
#
# Performs canonical decomposition on the characters in +str+ and returns
# the decomposed codepoints packed into a new UTF-8 string.
def decompose(str)
  codepoints = u_unpack(str)
  decompose_codepoints(:canonical, codepoints).pack('U*')
end
Convert characters in the string to lowercase
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 262
#
# Converts the characters in +str+ to lowercase by applying the
# :lowercase_mapping through +to_case+.
def downcase(str)
  to_case(:lowercase_mapping, str)
end
Returns the number of grapheme clusters in the string. This method is very likely to be moved or renamed in future versions.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 345
#
# Returns the number of grapheme clusters in +str+, i.e. the number of
# groups produced by +g_unpack+.
def g_length(str)
  clusters = g_unpack(str)
  clusters.length
end
Returns the position of the passed argument in the string, counting in codepoints
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 130
#
# Returns the position of the passed argument in +str+, counted in
# codepoints, or nil when String#index finds nothing.
#
# +args+ are forwarded to String#index unchanged; the offset it returns is
# converted by counting the codepoints that precede it.
def index(str, *args)
  found_at = str.index(*args)
  return nil unless found_at
  u_unpack(str.slice(0...found_at)).size
end
Inserts the passed string at specified codepoint offsets
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 120
#
# Inserts +fragment+ into +str+ at the given codepoint +offset+.
#
# Mutates +str+ in place and returns it. Negative offsets follow the
# semantics of Array#insert on the unpacked codepoints.
#
# Raises IndexError when +offset+ lies beyond the end of the string.
# Without this check Array#insert would pad the gap with nil and the
# subsequent pack would fail with an unrelated TypeError.
def insert(str, offset, fragment)
  codepoints = u_unpack(str)
  raise IndexError, "index #{offset} out of string" if offset > codepoints.length
  str.replace(
    codepoints.insert(
      offset,
      u_unpack(fragment)
    ).flatten.pack('U*')
  )
end
Works just like String#ljust, only integer specifies characters instead of bytes.
Example:
"¾ cup".chars.ljust(8).to_s # => "¾ cup   " "¾ cup".chars.ljust(8, " ").to_s # Use non-breaking whitespace # => "¾ cup   "
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 193
#
# Character-aware counterpart of String#ljust: +integer+ is the target
# width in characters and +padstr+ is the padding. The work is done by
# +justify+ with the :left strategy.
def ljust(str, integer, padstr=' ')
  way = :left
  justify(str, integer, way, padstr)
end
Returns the KC normalization of the string by default. NFKC is considered the best normalization form for passing strings to databases and validations.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 280
#
# Returns a normalized copy of +str+.
#
# +form+ selects the normalization variant and defaults to
# ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM:
# * :d  - canonical decomposition, reordered
# * :c  - canonical decomposition, reordered, then composed
# * :kd - compatibility decomposition, reordered
# * :kc - compatibility decomposition, reordered, then composed
#
# Raises ArgumentError for any other +form+.
def normalize(str, form=ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM)
  # See http://www.unicode.org/reports/tr15, Table 1
  codepoints = u_unpack(str)
  decomposition = case form
                  when :d, :c
                    :canonical
                  when :kd, :kc
                    :compatability
                  else
                    raise ArgumentError, "#{form} is not a valid normalization variant", caller
                  end
  normalized = reorder_characters(decompose_codepoints(decomposition, codepoints))
  # Only the "composed" forms get a composition pass after reordering.
  normalized = compose_codepoints(normalized) if [:c, :kc].include?(form)
  normalized.pack('U*')
end
Reverses codepoints in the string.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 232
#
# Returns a new string with the codepoints of +str+ in reverse order.
def reverse(str)
  codepoints = u_unpack(str)
  codepoints.reverse.pack('U*')
end
Works just like String#rjust, only integer specifies characters instead of bytes.
Example:
"¾ cup".chars.rjust(8).to_s # => " ¾ cup" "¾ cup".chars.rjust(8, " ").to_s # Use non-breaking whitespace # => " ¾ cup"
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 180
#
# Character-aware counterpart of String#rjust: +integer+ is the target
# width in characters and +padstr+ is the padding. The work is done by
# +justify+ with the :right strategy.
def rjust(str, integer, padstr=' ')
  way = :right
  justify(str, integer, way, padstr)
end
Returns the number of codepoints in the string
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 226
#
# Returns the number of codepoints in +str+.
def size(str)
  codepoints = u_unpack(str)
  codepoints.size
end
Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that character.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 238
#
# Implements Unicode-aware slice with codepoints.
#
# Accepted argument shapes:
# * a Regexp (optionally with a capture index) - delegated to String#slice
# * a single Numeric offset - returns the codepoint (an Integer) at that
#   character offset
# * a Range, or an offset and a length - returns the selected characters as
#   a string, or nil when the selection lies outside the string
#
# Raises ArgumentError for more than two arguments and TypeError when a
# two-argument call is not (Numeric or Regexp, Numeric).
def slice(str, *args)
  if args.size > 2
    raise ArgumentError, "wrong number of arguments (#{args.size} for 1)" # Do as if we were native
  elsif (args.size == 2 && !(args.first.is_a?(Numeric) || args.first.is_a?(Regexp)))
    raise TypeError, "cannot convert #{args.first.class} into Integer" # Do as if we were native
  elsif (args.size == 2 && !args[1].is_a?(Numeric))
    raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
  elsif args[0].kind_of? Regexp
    str.slice(*args)
  elsif args.size == 1 && args[0].kind_of?(Numeric)
    u_unpack(str)[args[0]]
  else
    # Array#slice returns nil for an out-of-range start; mirror the native
    # String#slice and return nil instead of calling pack on nil.
    cps = u_unpack(str).slice(*args)
    cps.nil? ? nil : cps.pack('U*')
  end
end
Removes leading and trailing whitespace
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 221
#
# Returns a copy of +str+ with everything matched by UNICODE_LEADERS_PAT
# removed first and everything matched by UNICODE_TRAILERS_PAT removed
# afterwards.
def strip(str)
  without_leaders = str.gsub(UNICODE_LEADERS_PAT, '')
  without_leaders.gsub(UNICODE_TRAILERS_PAT, '')
end
Replaces all the non-utf-8 bytes by their iso-8859-1 or cp1252 equivalent resulting in a valid utf-8 string
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 350
#
# Replaces all the non-UTF-8 bytes in +str+ by their ISO-8859-1 or CP1252
# equivalent, resulting in a valid UTF-8 string.
#
# The string is split with a UTF-8 aware pattern; pieces that match the
# valid-character pattern are kept, every other piece is re-encoded from its
# first byte value.
def tidy_bytes(str)
  tidied = str.split(//u).map do |char|
    if ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(char)
      char
    else
      byte = char.unpack('C')[0]
      if byte < 128
        byte.chr
      elsif byte < 160
        # Prefer the CP1252 mapping, fall back to the byte value itself.
        [UCD.cp1252[byte] || byte].pack('U')
      elsif byte < 192
        "\xC2" + byte.chr
      else
        "\xC3" + (byte-64).chr
      end
    end
  end
  tidied.join
end
Used to translate an offset from bytes to characters, for instance one received from a regular expression match
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 312
#
# Translates an offset from bytes to characters, for instance one received
# from a regular expression match.
#
# Returns nil when +byte_offset+ is nil and 0 when +str+ is empty. Otherwise
# returns the zero-based index of the character that contains the byte at
# +byte_offset+.
#
# Raises EncodingError when no decodable prefix can be found before the end
# of the string.
def translate_offset(str, byte_offset)
  return nil if byte_offset.nil?
  return 0 if str == ''
  # Work on explicit byte values so the result does not depend on whether
  # String#[] indexes bytes or characters.
  bytes = str.unpack('C*')
  last = byte_offset
  begin
    bytes[0..last].pack('C*').unpack('U*').length - 1
  rescue ArgumentError
    # We damaged a character: extend the chunk by one byte and retry,
    # but stop retrying at the end of the string.
    last += 1
    retry if last < bytes.length
    raise EncodingError, 'malformed UTF-8 character'
  end
end
Convert characters in the string to uppercase
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 259
#
# Converts the characters in +str+ to uppercase by applying the
# :uppercase_mapping through +to_case+.
def upcase(str)
  to_case(:uppercase_mapping, str)
end
Compose decomposed characters to the composed form
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 494 494: def compose_codepoints(codepoints) 495: pos = 0 496: eoa = codepoints.length - 1 497: starter_pos = 0 498: starter_char = codepoints[0] 499: previous_combining_class = -1 500: while pos < eoa 501: pos += 1 502: lindex = starter_char - HANGUL_LBASE 503: # -- Hangul 504: if 0 <= lindex and lindex < HANGUL_LCOUNT 505: vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1 506: if 0 <= vindex and vindex < HANGUL_VCOUNT 507: tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1 508: if 0 <= tindex and tindex < HANGUL_TCOUNT 509: j = starter_pos + 2 510: eoa -= 2 511: else 512: tindex = 0 513: j = starter_pos + 1 514: eoa -= 1 515: end 516: codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE 517: end 518: starter_pos += 1 519: starter_char = codepoints[starter_pos] 520: # -- Other characters 521: else 522: current_char = codepoints[pos] 523: current = UCD[current_char] 524: if current.combining_class > previous_combining_class 525: if ref = UCD.composition_map[starter_char] 526: composition = ref[current_char] 527: else 528: composition = nil 529: end 530: unless composition.nil? 531: codepoints[starter_pos] = composition 532: starter_char = composition 533: codepoints.delete_at pos 534: eoa -= 1 535: pos -= 1 536: previous_combining_class = -1 537: else 538: previous_combining_class = current.combining_class 539: end 540: else 541: previous_combining_class = current.combining_class 542: end 543: if current.combining_class == 0 544: starter_pos = pos 545: starter_char = codepoints[pos] 546: end 547: end 548: end 549: codepoints 550: end
Decompose composed characters to the decomposed form
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 473
#
# Decomposes composed characters to the decomposed form.
#
# +type+ is the decomposition type; mappings that carry a decomp_type are
# only applied when +type+ is :compatability. +codepoints+ is an array of
# integer codepoints. Returns a new array; mapped codepoints are decomposed
# recursively.
def decompose_codepoints(type, codepoints)
  codepoints.inject([]) do |result, codepoint|
    if HANGUL_SBASE <= codepoint && codepoint < HANGUL_SLAST
      # Hangul syllable: derive its components arithmetically.
      syllable_index = codepoint - HANGUL_SBASE
      lead, remainder = syllable_index.divmod(HANGUL_NCOUNT)
      vowel, trail = remainder.divmod(HANGUL_TCOUNT)
      result << (HANGUL_LBASE + lead)
      result << (HANGUL_VBASE + vowel)
      # A trailing index of zero means there is no trailing component.
      result << (HANGUL_TBASE + trail) unless trail == 0
      result
    else
      mapping = UCD[codepoint].decomp_mapping
      if mapping && (!UCD[codepoint].decomp_type || type == :compatability)
        # Decomposable with the current decomposition type.
        result.concat(decompose_codepoints(type, mapping.dup))
      else
        result << codepoint
      end
    end
  end
end
Unpack the string at grapheme boundaries instead of codepoint boundaries
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 381
#
# Unpacks +str+ at grapheme boundaries instead of codepoint boundaries.
#
# Returns an array of arrays of codepoints, one inner array per cluster.
# Two neighbouring codepoints stay in the same cluster when any of these
# rules matches (checked in this order):
#   1. CR X LF
#   2. L X (L|V|LV|LVT)
#   3. (LV|V) X (V|T)
#   4. (LVT|T) X (T)
#   5. X Extend
def g_unpack(str)
  codepoints = u_unpack(str)
  clusters = []
  cluster_start = 0
  cursor = 0
  total = codepoints.length
  while cursor < total
    cursor += 1
    previous = codepoints[cursor-1]
    # nil once the cursor has moved past the last codepoint
    current = codepoints[cursor]
    joined = (previous == UCD.boundary[:cr] && current == UCD.boundary[:lf]) ||
             (UCD.boundary[:l] === previous && in_char_class?(current, [:l,:v,:lv,:lvt])) ||
             (in_char_class?(previous, [:lv,:v]) && in_char_class?(current, [:v,:t])) ||
             (in_char_class?(previous, [:lvt,:t]) && UCD.boundary[:t] === current) ||
             (UCD.boundary[:extend] === current)
    unless joined
      # Boundary found: close the cluster that ends just before the cursor.
      clusters << codepoints[cluster_start..cursor-1]
      cluster_start = cursor
    end
  end
  clusters
end
Detect whether the codepoint is in a certain character class. Primarily used by the grapheme cluster support.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 367
#
# Detects whether +codepoint+ is in at least one of the character classes
# named in +classes+, using the matchers stored in UCD.boundary.
# Returns true or false.
def in_char_class?(codepoint, classes)
  classes.any? { |char_class| UCD.boundary[char_class] === codepoint }
end
Justifies a string in a certain way. Valid values for way are :right, :left and :center. Is primarily used as a helper method by rjust, ljust and center.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 418
#
# Justifies +str+ to a width of +integer+ characters using +padstr+.
#
# +way+ is :right (pad on the left), :left (pad on the right) or :center
# (pad on both sides, the extra character going to the right). Returns a
# padded copy of +str+; any other +way+ yields nil.
#
# Raises ArgumentError when +padstr+ is empty.
def justify(str, integer, way, padstr=' ')
  raise ArgumentError, "zero width padding" if padstr.length == 0
  padsize = integer - size(str)
  # Never pad by a negative amount.
  padsize = 0 unless padsize > 0
  case way
  when :right then str.dup.insert(0, padding(padsize, padstr))
  when :left then str.dup.insert(-1, padding(padsize, padstr))
  when :center
    half = padsize / 2.0
    left_pad = padding(half.floor, padstr)
    right_pad = padding(half.ceil, padstr)
    str.dup.insert(0, left_pad).insert(-1, right_pad)
  end
end
Generates a padding string of a certain size.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 435
#
# Generates a padding string of +padsize+ characters by repeating +padstr+
# and cutting the repetition to length. Returns '' when +padsize+ is 0.
def padding(padsize, padstr=' ')
  return '' if padsize == 0
  # One repetition more than strictly needed, then trim to size.
  repeats = (padsize / size(padstr)) + 1
  slice(padstr * repeats, 0, padsize)
end
Re-order codepoints so the string becomes canonical
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 457
#
# Re-orders codepoints so the string becomes canonical.
#
# Walks over neighbouring pairs and swaps a pair when the first has a higher
# combining class than the second and the second's class is non-zero. After
# a swap the cursor steps back one position (or forward when already at the
# start) so the moved codepoint is compared again.
# Modifies +codepoints+ in place and returns it.
def reorder_characters(codepoints)
  last_pair = codepoints.length - 1
  cursor = 0
  while cursor < last_pair
    first = UCD[codepoints[cursor]]
    second = UCD[codepoints[cursor+1]]
    out_of_order = (first.combining_class > second.combining_class) && (second.combining_class > 0)
    if out_of_order
      codepoints[cursor..cursor+1] = second.code, first.code
      cursor += (cursor > 0 ? -1 : 1)
    else
      cursor += 1
    end
  end
  codepoints
end
Convert characters to a different case
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 444
#
# Converts the characters of +str+ to a different case.
#
# +way+ is the name of the mapping attribute to read from each UCD entry
# (the callers in this file pass :uppercase_mapping and :lowercase_mapping).
# A codepoint is kept unchanged when it has no UCD entry or when its mapping
# is not greater than zero.
# Returns a new string.
def to_case(way, str)
  converted = u_unpack(str).map do |codepoint|
    entry = UCD[codepoint]
    if entry.nil?
      codepoint
    else
      mapped = entry.send(way)
      mapped > 0 ? mapped : codepoint
    end
  end
  converted.pack('U*')
end