Source for file unicode-defs.php

Documentation is available at unicode-defs.php

  1. <?php
  2. /* ******************************************************************** */
  3. /* CATALYST PHP Source Code */
  4. /* -------------------------------------------------------------------- */
  5. /* This program is free software; you can redistribute it and/or modify */
  6. /* it under the terms of the GNU General Public License as published by */
  7. /* the Free Software Foundation; either version 2 of the License, or */
  8. /* (at your option) any later version. */
  9. /* */
  10. /* This program is distributed in the hope that it will be useful, */
  11. /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
  12. /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
  13. /* GNU General Public License for more details. */
  14. /* */
  15. /* You should have received a copy of the GNU General Public License */
  16. /* along with this program; if not, write to: */
  17. /* The Free Software Foundation, Inc., 59 Temple Place, Suite 330, */
  18. /* Boston, MA 02111-1307 USA */
  19. /* -------------------------------------------------------------------- */
  20. /* */
  21. /* Filename: unicode-defs.php */
  22. /* Author: Paul Waite */
  23. /* Description: Various functions to help with Unicode conversion etc. */
  24. /* */
  25. /* ******************************************************************** */
  26. /** @package i18n *//**
  27. * takes a string of unicode entities and converts it to a utf-8 encoded string
  28. * each unicode entitiy has the form &#nnn(nn); n={0..9} and can be displayed by
  29. * utf-8 supporting browsers. Ascii will not be modified.
  30. * @param string $source String of unicode entities
  31. * @return string The utf-8 encoded string
  32. */
  33. function utf8encode($source) {
  34. $utf8Str = '';
  35. $entityArray = explode ("&#", $source);
  36. $size = count ($entityArray);
  37. for ($i = 0; $i < $size; $i++) {
  38. $subStr = $entityArray[$i];
  39. $nonEntity = strstr ($subStr, ';');
  40. if ($nonEntity !== false) {
  41. $unicode = intval (substr ($subStr, 0, (strpos ($subStr, ';') + 1)));
  42. // determine how many chars are needed to reprsent this unicode char
  43. if ($unicode < 128) {
  44. $utf8Substring = chr ($unicode);
  45. }
  46. else if ($unicode >= 128 && $unicode < 2048) {
  47. $binVal = str_pad (decbin ($unicode), 11, "0", STR_PAD_LEFT);
  48. $binPart1 = substr ($binVal, 0, 5);
  49. $binPart2 = substr ($binVal, 5);
  50.  
  51. $char1 = chr (192 + bindec ($binPart1));
  52. $char2 = chr (128 + bindec ($binPart2));
  53. $utf8Substring = $char1 . $char2;
  54. }
  55. else if ($unicode >= 2048 && $unicode < 65536) {
  56. $binVal = str_pad (decbin ($unicode), 16, "0", STR_PAD_LEFT);
  57. $binPart1 = substr ($binVal, 0, 4);
  58. $binPart2 = substr ($binVal, 4, 6);
  59. $binPart3 = substr ($binVal, 10);
  60.  
  61. $char1 = chr (224 + bindec ($binPart1));
  62. $char2 = chr (128 + bindec ($binPart2));
  63. $char3 = chr (128 + bindec ($binPart3));
  64. $utf8Substring = $char1 . $char2 . $char3;
  65. }
  66. else {
  67. $binVal = str_pad (decbin ($unicode), 21, "0", STR_PAD_LEFT);
  68. $binPart1 = substr ($binVal, 0, 3);
  69. $binPart2 = substr ($binVal, 3, 6);
  70. $binPart3 = substr ($binVal, 9, 6);
  71. $binPart4 = substr ($binVal, 15);
  72.  
  73. $char1 = chr (240 + bindec ($binPart1));
  74. $char2 = chr (128 + bindec ($binPart2));
  75. $char3 = chr (128 + bindec ($binPart3));
  76. $char4 = chr (128 + bindec ($binPart4));
  77. $utf8Substring = $char1 . $char2 . $char3 . $char4;
  78. }
  79.  
  80. if (strlen ($nonEntity) > 1)
  81. $nonEntity = substr ($nonEntity, 1); // chop the first char (';')
  82. else
  83. $nonEntity = '';
  84.  
  85. $utf8Str .= $utf8Substring . $nonEntity;
  86. }
  87. else {
  88. $utf8Str .= $subStr;
  89. }
  90. }
  91. return $utf8Str;
  92. } // utf8encode
  93. // -----------------------------------------------------
  94. /*
  95. * Returns true if the given string is UTF-8 compliant.
  96. * NB: this doesn't necessarily mean it IS encoded as
  97. * UTF-8 - it might just be an ASCII string.
  98. * @param string $ String to check for compliance
  99. * @return boolean True if string complies with UTF-8 format
  100. */
  101. function is_utf8($s) {
  102. for ($i = 0; $i < strlen($s); $i++) {
  103. $charOrd = ord($s[$i]);
  104. if ($charOrd < 0x80) {
  105. continue; # 0bbbbbbb
  106. }
  107. elseif (($charOrd & 0xE0) == 0xC0) $n=1; # 110bbbbb
  108. elseif (($charOrd & 0xF0) == 0xE0) $n=2; # 1110bbbb
  109. elseif (($charOrd & 0xF8) == 0xF0) $n=3; # 11110bbb
  110. elseif (($charOrd & 0xFC) == 0xF8) $n=4; # 111110bb
  111. elseif (($charOrd & 0xFE) == 0xFC) $n=5; # 1111110b
  112. else {
  113. # Does not match any model
  114. return false;
  115. }
  116. # n bytes matching 10bbbbbb follow ?
  117. for ($j = 0; $j < $n; $j++) {
  118. if ((++$i == strlen($s)) || ((ord($s[$i]) & 0xC0) != 0x80)) {
  119. return false;
  120. }
  121. }
  122. } // for
  123. return true;
  124. } // is_utf8
  125. // -----------------------------------------------------
  126. /*
  127. * Return the Unicode ordinal value of a UTF-8 character sequence.
  128. * @param string $c Multi-byte 'string' representing Unicode char
  129. * $return integer The ordinal Unicode code for this character
  130. */
  131. function utf8ord($c) {
  132. $uni = 0;
  133. if (ord($c{0})>=0 && ord($c{0})<=127) {
  134. $uni = $c{0};
  135. }
  136. elseif (ord($c{0})>=192 && ord($c{0})<=223) {
  137. $uni = (ord($c{0})-192)*64 + (ord($c{1})-128);
  138. }
  139. elseif (ord($c{0})>=224 && ord($c{0})<=239) {
  140. $uni = (ord($c{0})-224)*4096 + (ord($c{1})-128)*64 + (ord($c{2})-128);
  141. }
  142. elseif (ord($c{0})>=240 && ord($c{0})<=247) {
  143. $uni = (ord($c{0})-240)*262144 + (ord($c{1})-128)*4096 + (ord($c{2})-128)*64 + (ord($c{3})-128);
  144. }
  145. elseif (ord($c{0})>=248 && ord($c{0})<=251) {
  146. $uni = (ord($c{0})-248)*16777216 + (ord($c{1})-128)*262144 + (ord($c{2})-128)*4096 + (ord($c{3})-128)*64 + (ord($c{4})-128);
  147. }
  148. elseif (ord($c{0})>=252 && ord($c{0})<=253) {
  149. $uni = (ord($c{0})-252)*1073741824 + (ord($c{1})-128)*16777216 + (ord($c{2})-128)*262144 + (ord($c{3})-128)*4096 + (ord($c{4})-128)*64 + (ord($c{5})-128);
  150. }
  151. elseif (ord($c{0})>=254 && ord($c{0})<=255) {//error
  152. $uni = false;
  153. }
  154. return $uni;
  155. } // utf8ord
  156. // -----------------------------------------------------
  157.  
  158. /**
  159. * Ensure a string is encoded as UTF-8..
  160. */
  161. function utf8_ensure($s) {
  162. return is_utf8($s) ? $s: utf8_encode($s);
  163. } // utf8_ensure
  164. // -----------------------------------------------------
  165.  
  166. /**
  167. * RFC1738 compliant replacement to PHP's rawurldecode - which
  168. * actually works with unicode (using utf-8 encoding).
  169. * @param string $source The original string
  170. * @return string Unicode-safe rawurldecoded string
  171. */
  172. function utf8RawUrlDecode($source) {
  173. $decodedStr = '';
  174. $pos = 0;
  175. $len = strlen($source);
  176. while ($pos < $len) {
  177. $charAt = substr($source, $pos, 1);
  178. if ($charAt == '%') {
  179. $pos++;
  180. $charAt = substr($source, $pos, 1);
  181. if ($charAt == 'u') {
  182. // we got a unicode character
  183. $pos++;
  184. $unicodeHexVal = substr($source, $pos, 4);
  185. $unicode = hexdec($unicodeHexVal);
  186. $entity = "&#". $unicode . ';';
  187. $decodedStr .= utf8encode($entity);
  188. $pos += 4;
  189. }
  190. else {
  191. // we have an escaped ascii character
  192. $hexVal = substr($source, $pos, 2);
  193. $decodedStr .= chr(hexdec ($hexVal));
  194. $pos += 2;
  195. }
  196. }
  197. else {
  198. $decodedStr .= $charAt;
  199. $pos++;
  200. }
  201. }
  202. return $decodedStr;
  203. } // utf8RawUrlDecode
  204. // -----------------------------------------------------
  205.  
  206. /**
  207. * Replacement for PHP's rawurlencode. This version skips any existing
  208. * sequences of '%xx', which represent already-encoded chars. Also
  209. * uses the multi=byte string functions to preseve unicode chars
  210. * integrity.
  211. * @param string $str The string to URL encode
  212. * @return string The URL-encoded string
  213. */
  214. Function utf8RawUrlEncode($str) {
  215. $len = strlen($str);
  216. $res = "";
  217. $i = 0;
  218. $mb = function_exists("mb_substr");
  219. while ($i < $len) {
  220. if ($mb) $chk = mb_substr($str, $i, 3);
  221. else $chk = substr($str, $i, 3);
  222. if(preg_match("/%[0-9a-f]/i", $chk)) {
  223. $res .= $chk;
  224. $i += 3;
  225. }
  226. else {
  227. if ($mb) $charAt = mb_substr($str, $i, 1);
  228. else $charAt = substr($str, $i, 1);
  229. $charOrd = ord($charAt);
  230. if (($charOrd >= 65 && $charOrd <= 90)
  231. || ($charOrd >= 97 && $charOrd <= 122)
  232. || ($charOrd >= 48 && $charOrd <= 57)
  233. || ($charOrd == 33)
  234. || ($charOrd == 36)
  235. || ($charOrd == 95)) {
  236. // this is alphanumeric or $-_.+!*'(), which according
  237. // to RFC1738 we don't escape
  238. $res .= $charAt;
  239. }
  240. else {
  241. if (ord($charAt) >= 0x80 && is_utf8($charAt)) {
  242. $charOrd = utf8ord($charAt);
  243. $hexValStr = "%u" . sprintf("%04x", $charOrd);
  244. $res .= $hexValStr;
  245. }
  246. elseif ($charOrd > 0) {
  247. $res .= "%";
  248. $hexValStr = sprintf("%02x", $charOrd);
  249. $res .= $hexValStr;
  250. }
  251. }
  252. $i += 1;
  253. }
  254. } // while
  255. return $res;
  256. } // utf8RawUrlEncode
  257. // -----------------------------------------------------
  258.  
  259. ?>

Documentation generated by phpDocumentor 1.3.0RC3