yaffs.net Git - yaffs-website/blob - web/core/scripts/transliteration_data.php.txt

   1 <?php
   2
   3 /**
   4  * @file
   5  * Unifies formats of transliteration data from various sources.
   6  *
   7  * A few notes about this script:
   8  * - The functions in this file are NOT SECURE, because they use PHP functions
   9  *   like eval(). Absolutely do not run this script unless you trust the data
  10  *   files used for input.
  11  * - You will need to change the name of this file to remove the .txt extension
  12  *   before running it (it has been given this name so that you cannot run it
  13  *   by mistake). When you do that, move it out of your web root as well so
  14  *   that it cannot be run via a URL, and run the script via the PHP command
  15  *   at a command prompt.
  16  * - This script, depending on which portions of it you run, depends on having
  17  *   input data from various sources in sub-directories below where this file
  18  *   is located. The data inputs are as follows:
  19  *   - Existing Drupal Core transliteration data: Sub-directory 'data'; comes
  20  *     from core/lib/Drupal/Component/Transliteration/data
  21  *   - Midgardmvc data: Sub-directory 'utf8_to_ascii_db'; download from
  22  *     https://github.com/bergie/midgardmvc_helper_urlize/downloads
  23  *   - CPAN Text-Unidecode data: Sub-directory 'Unidecode'; download from
  24  *     http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm
  25  *   - Node.js project: Sub-directory 'unidecoder_data'; download from
  26  *     https://github.com/bitwalker/stringex/downloads
  27  *   - JUnidecode project: Sub-directory 'junidecode'; download source from
  28  *     http://www.ippatsuman.com/projects/junidecode/index.html
  29  * - You will also need to make directory 'outdata' to hold output.
  30  * - If you plan to use the 'intl' data, you will also need to have the PECL
  31  *   packages 'yaml' and 'intl' installed.  See
  32  *   http://php.net/manual/install.pecl.downloads.php for generic PECL
  33  *   package installation instructions. The following commands on Ubuntu Linux
  34  *   will install yaml and intl packages:
  35  *   @code
  36  *   sudo apt-get install libyaml-dev
  37  *   sudo pecl install yaml
  38  *   sudo apt-get install php5-intl
  39  *   sudo apt-get install libicu-dev
  40  *   sudo pecl install intl
  41  *   @endcode
  42  *   After running these commands, you will need to make sure
  43  *   'extension=intl.so' and 'extension=yaml.so' are added to the php.ini file
  44  *   that is in use for the PHP command-line command.
  45  * - When you have collected all of the data and installed the required
  46  *   packages, you will need to find the specific commands below that you want
  47  *   to use and un-comment them. The preferred data source for Drupal Core is
  48  *   the PECL 'intl' package, and the line that needs to be un-commented in
  49  *   order to make a Drupal Core patch is:
  50  *   @code
  51  *   patch_drupal('outdata');
  52  *   @endcode
  53  * - The functions are documented in more detail in their headers where they
  54  *   are defined. Many have parameters that you can use to change the output.
  55  */
  56
  57 // Commands to read various data sources:
  58 // $data = read_drupal_data();
  59 // $data = read_midgard_data();
  60 // $data = read_cpan_data();
  61 // $data = read_nodejs_data();
  62 // $data = read_intl_data();
  63 // $data = read_junidecode_data();
  64
  65 // After running a read_*_data() function, you can print out the data
  66 // (it will make a LOT of output):
  67 // print_r($data);
  68
  69 // Command to read in all of data sources and output in CSV format, explaining
  70 // the differences:
  71 // read_all_to_csv();
  72
  73 // Command to patch Drupal Core data, using the intl data set, and put the
  74 // resulting changed data files in the 'outdata' directory:
  75 patch_drupal('outdata');
  76
  77 /**
  78  * Reads in all transliteration data and outputs differences in CSV format.
  79  *
  80  * Each data set is compared to the Drupal Core reference data set, and the
  81  * differences are noted. The data must be in the locations noted in the
  82  * file header above. The CSV output has several columns. The first one is the
  83  * Unicode character code. The next columns contain the transliteration of
  84  * that character in each of the data sets. The last column, tells what the
  85  * differences are between the Drupal Core reference set and the other data
  86  * sets:
  87  * - missing: The target set is missing data that the Drupal set has.
  88  * - provided: The target set has provided data that Drupal does not have.
  89  * - case: The target and Drupal set output differ only in upper/lower case.
  90  * - different: The target and Drupal set output differ in more than just case.
  91  *
  92  * @param bool $print_all
  93  *   TRUE to print all data; FALSE (default) to print just data where there
  94  *   are differences between the Drupal set and other data sources.
  95  * @param bool $print_missing
  96  *   TRUE to print cases where one of the non-Drupal sets is missing information
  97  *   and that is the only difference; FALSE (default) to include these rows.
  98  */
  99 function read_all_to_csv($print_all = FALSE, $print_missing = FALSE) {
 100   $data = array();
 101   $types = array('drupal', 'midgard', 'cpan', 'nodejs', 'junidecode', 'intl');
 102
 103   // Alternatively, if you just want to compare a couple of data sets, you can
 104   // uncomment and edit the following line:
 105   // $types = array('drupal', 'intl');
 106
 107   // Read in all the data.
 108   foreach ($types as $type) {
 109     $data[$type] = call_user_func('read_' . $type . '_data');
 110   }
 111
 112   // Print CSV header row.
 113   print "character,";
 114   print implode(',', $types);
 115   print ",why\n";
 116
 117   // Go through all the banks of character data.
 118   for ($bank = 0; $bank < 256; $bank++) {
 119
 120     // Go through characters in bank; skip pure ASCII characters.
 121     $start = ($bank == 0) ? 0x80 : 0;
 122     for ($chr = $start; $chr < 256; $chr++) {
 123
 124       // Gather the data together for this character.
 125       $row = array();
 126       foreach ($types as $type) {
 127         $row[$type] = (isset($data[$type][$bank][$chr]) && is_string($data[$type][$bank][$chr])) ? $data[$type][$bank][$chr] : '';
 128       }
 129
 130       // Only print if there are differences or we are printing all data.
 131       $print = $print_all;
 132       $ref = $row['drupal'];
 133       $why = array();
 134       foreach ($types as $type) {
 135         // Try to characterize what the differences are.
 136         if ($row[$type] != $ref) {
 137           if ($row[$type] == '') {
 138             $why['missing'] = 'missing';
 139             if ($print_missing) {
 140               $print = TRUE;
 141             }
 142           }
 143           elseif ($ref == '') {
 144             $why['provided'] = 'provided';
 145             $print = TRUE;
 146           }
 147           elseif ($row[$type] == strtolower($ref) || $row[$type] == strtoupper($ref)) {
 148             $why['case'] = 'case';
 149             $print = TRUE;
 150           }
 151           else {
 152             $why['different'] = 'different';
 153             $print = TRUE;
 154           }
 155         }
 156       }
 157
 158       // Print the data line.
 159       if ($print) {
 160         print '0x' . sprintf('%04x', 256 * $bank + $chr) . ',';
 161         foreach ($row as $out) {
 162           print '"' . addcslashes($out, '"') . '", ';
 163         }
 164         print implode(':', $why);
 165         print "\n";
 166       }
 167     }
 168   }
 169 }
 170
 171 /**
 172  * Reads in 'intl' transliteration data and writes out changed Drupal files.
 173  *
 174  * Writes out the Drupal data files that would have to change to make our data
 175  * match the intl data set.
 176  *
 177  * @param string $outdir
 178  *   Directory to put the patched data files in (under where the script is
 179  *   being run).
 180  */
 181 function patch_drupal($outdir) {
 182   $data = array();
 183
 184   // Note that this is hard-wired below. Changing this line will have no
 185   // effect except to break this function.
 186   $types = array('drupal', 'intl');
 187
 188   // Read in all the data.
 189   foreach ($types as $type) {
 190     $data[$type] = call_user_func('read_' . $type . '_data');
 191   }
 192
 193   // Go through all the banks of character data.
 194   for ($bank = 0; $bank < 256; $bank++) {
 195     $print_bank = FALSE;
 196
 197     // Go through characters in bank; skip pure ASCII characters.
 198     $start = ($bank == 0) ? 0x80 : 0;
 199     $newdata = array();
 200     for ($chr = 0; $chr < 256; $chr++) {
 201       // Fill up the start of the ASCII range.
 202       if ($chr < $start) {
 203         $newdata[$chr] = chr($chr);
 204         continue;
 205       }
 206
 207       // Figure out what characters we actually have.
 208       $drupal = isset($data['drupal'][$bank][$chr]) ? $data['drupal'][$bank][$chr] : NULL;
 209       // Note that for intl, we only want to keep the transliteration if it
 210       // has something other than '' in it.
 211       $intl = isset($data['intl'][$bank][$chr]) && $data['intl'][$bank][$chr] != '' ? $data['intl'][$bank][$chr] : NULL;
 212       // Make sure we have something in the Drupal data set, in case we need
 213       // to print.
 214       $newdata[$chr] = $drupal;
 215
 216       if (!isset($intl)) {
 217         continue;
 218       }
 219       if (!isset($drupal) || $drupal != $intl) {
 220         $print_bank = TRUE;
 221         $newdata[$chr] = $intl;
 222       }
 223     }
 224
 225     // If we found a difference, output a data file.
 226     if ($print_bank) {
 227       write_data_file($newdata, $bank, $outdir);
 228     }
 229   }
 230 }
 231
 232 /**
 233  * Reads in the Drupal Core generic transliteration data set.
 234  *
 235  * The data is expected to be in files xNN.php in directory 'data' under
 236  * this file's directory.
 237  *
 238  * @return array
 239  *   Nested array of transliteration data. Outer keys are the first two
 240  *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 241  *   level is the other two bytes, and the values are the transliterations.
 242  *
 243  * @see PhpTransliteration::readGenericData()
 244  */
 245 function read_drupal_data() {
 246   $dir = __DIR__ . '/data';
 247   $out = array();
 248
 249   // Read data files.
 250   for ($bank = 0; $bank < 256; $bank++) {
 251     $base = array();
 252     $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';
 253     if (is_file($file)) {
 254       include($file);
 255     }
 256     $out[$bank] = $base;
 257   }
 258
 259   return $out;
 260 }
 261
 262 /**
 263  * Reads in the MidgardMVC transliteration data.
 264  *
 265  * The data is expected to be in files xNN.php in directory utf8_to_ascii_db
 266  * under the directory where this file resides. It can be downloaded from
 267  * https://github.com/bergie/midgardmvc_helper_urlize/downloads.
 268  *
 269  * @return array
 270  *   Nested array of transliteration data. Outer keys are the first two
 271  *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 272  *   level is the other two bytes, and the values are the transliterations.
 273  */
 274 function read_midgard_data() {
 275   $dir = __DIR__ . '/utf8_to_ascii_db';
 276   $out = array();
 277
 278   // Read data files.
 279   for ($bank = 0; $bank < 256; $bank++) {
 280     $UTF8_TO_ASCII = array($bank => array());
 281     $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';
 282     if (is_file($file)) {
 283       include($file);
 284     }
 285     $base = $UTF8_TO_ASCII[$bank];
 286
 287     // For unknown characters, these files have '[?]' in them. Replace with
 288     // NULL for compatibility with our data.
 289     $base = array_map('_replace_question_with_null', $base);
 290     $out[$bank] = $base;
 291   }
 292
 293   return $out;
 294 }
 295
 296 /**
 297  * Reads in the CPAN Text::Unidecode data set.
 298  *
 299  * The data is expected to be in files xNN.pm in directory 'Unidecode' under
 300  * this file's directory. It can be downloaded from
 301  * http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm.
 302  *
 303  * @return array
 304  *   Nested array of transliteration data. Outer keys are the first two
 305  *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 306  *   level is the other two bytes, and the values are the transliterations.
 307  */
 308 function read_cpan_data() {
 309   $dir = __DIR__ . '/Unidecode';
 310   $out = array();
 311
 312   // Read data files.
 313   for ($bank = 0; $bank < 256; $bank++) {
 314     $base = array();
 315     $file = $dir . '/x' . sprintf('%02x', $bank) . '.pm';
 316     if (is_file($file)) {
 317       $base = _cpan_read_file($file);
 318     }
 319     $out[$bank] = $base;
 320   }
 321
 322   return $out;
 323 }
 324
 325 /**
 326  * Reads in the data in a single file from the Text::Unidecode CPAN project.
 327  *
 328  * @param string $file
 329  *   File to read from.
 330  *
 331  * @return array
 332  *   Data read from the file.
 333  *
 334  * @see read_cpan_data()
 335  */
 336 function _cpan_read_file($file) {
 337
 338   $contents = file($file);
 339   $save = '';
 340   foreach ($contents as $line) {
 341     // Discard lines starting with # or $. The first line seems to have a
 342     // comment starting with #, the second has a Perl line like
 343     // $Text::Unidecode::Char[0x04] = [, -- and we do not want either.
 344     if (preg_match('|^\s*[#\$]|', $line)) {
 345       continue;
 346     }
 347
 348     // Discard lines ending with semi-colons, which we also don't want
 349     // (there seem to be two of these lines at the end of the files).
 350     if (preg_match('|;\s*$|', $line)) {
 351       continue;
 352     }
 353
 354     // Replace '[?]' with nothing (that means "don't know how to
 355     // transliterate"). In some files, this is encoded as qq{[?]} or
 356     // qq{[?] } instead.
 357     $line = str_replace('qq{[?]}', 'NULL', $line);
 358     $line = str_replace('qq{[?] }', 'NULL', $line);
 359     $line = str_replace("'[?]'", 'NULL', $line);
 360
 361     // Replace qq{} with either "" or '' or nothing, depending on what is
 362     // inside it.
 363     $line = str_replace('qq{\{}', "'{'", $line);
 364     $line = str_replace('qq{\}}', "'}'", $line);
 365     $line = str_replace('qq{\} }', "'} '", $line);
 366     $line = str_replace("qq{\\\\}", '"\\\\"', $line);
 367     $line = str_replace("qq{\\", "qq{'", $line);
 368     $line = str_replace("qq{\"'}", "\"\\\"'\"", $line);
 369     $line = preg_replace('|qq\{([^\'\}]+)\}|', "'$1'", $line);
 370     $line = preg_replace('|qq\{([^\}]+)\}|', '"$1"', $line);
 371
 372     $save .= $line;
 373   }
 374
 375   // Now we should have a string that looks like:
 376   // 'a', 'b', ...
 377   // Evaluate as an array.
 378   $save = 'return array(' . $save . ');';
 379
 380   $data = @eval($save);
 381   if (isset($data) && is_array($data)) {
 382     $data = array_map('_replace_hex_with_character', $data);
 383   }
 384   else {
 385     // There was a problem, so throw an error and exit.
 386     print "Problem in evaluating $file\n";
 387     print $save;
 388     eval($save);
 389     exit();
 390   }
 391
 392   // For unknown characters, these files may still have '[?]' in them. Replace
 393   // with NULL for compatibility with our data.
 394   $data = array_map('_replace_question_with_null', $data);
 395
 396   return $data;
 397 }
 398
 399 /**
 400  * Reads in the Node.js transliteration data.
 401  *
 402  * The data is expected to be in files xNN.yml in directory unidecoder_data
 403  * under the directory where this file resides. It can be downloaded from
 404  * https://github.com/bitwalker/stringex/downloads. You also need the PECL
 405  * 'yaml' extension installed for this function to work.
 406  *
 407  * @return array
 408  *   Nested array of transliteration data. Outer keys are the first two
 409  *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 410  *   level is the other two bytes, and the values are the transliterations.
 411  */
 412 function read_nodejs_data() {
 413   $dir = __DIR__ . '/unidecoder_data';
 414   $out = array();
 415
 416   // Read data files.
 417   for ($bank = 0; $bank < 256; $bank++) {
 418     $base = array();
 419     $file = $dir . '/x' . sprintf('%02x', $bank) . '.yml';
 420     if (is_file($file)) {
 421       $base = yaml_parse_file($file);
 422       // For unknown characters, these files have '[?]' in them. Replace with
 423       // NULL for compatibility with our data.
 424       $base = array_map('_replace_question_with_null', $base);
 425     }
 426     $out[$bank] = $base;
 427   }
 428
 429   return $out;
 430 }
 431
 432 /**
 433  * Loads the PECL 'intl' Transliterator class's transliteration data.
 434  *
 435  * You need to have the PECL 'intl' package installed for this to work.
 436  *
 437  * @return array
 438  *   Nested array of transliteration data. Outer keys are the first two
 439  *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 440  *   level is the other two bytes, and the values are the transliterations.
 441  */
 442 function read_intl_data() {
 443   // In order to transliterate, you first have to create a transliterator
 444   // object. This needs a list of transliteration operations. You can get a
 445   // list of available operations with:
 446   //   print_r(Transliterator::listIDs()); exit();
 447   // And a few of these are documented on
 448   // http://userguide.icu-project.org/transforms/general and
 449   // http://www.unicode.org/reports/tr15/ (for normalizations).
 450   // There are also maps to the Unicode characters at:
 451   //  http://www.unicode.org/roadmaps/bmp/
 452   //  http://www.unicode.org/charts/nameslist/
 453   $ops = '';
 454
 455   // The first step in any transform: separate out accents and remove them.
 456   $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';
 457
 458   // Then you need to do a bunch of language-specific or script-specific
 459   // transliterations. Here is hopefully a representative set. There are
 460   // quite a few scripts that don't appear to have rules currently, such
 461   // as Etheopian.
 462   $ops .= 'Greek-Latin; ';
 463   $ops .= 'Cyrillic-Latin; ';
 464   $ops .= 'Armenian-Latin; ';
 465   $ops .= 'Hebrew-Latin; ';
 466   $ops .= 'Arabic-Latin; ';
 467   $ops .= 'Syriac-Latin; ';
 468   $ops .= 'Thaana-Latin; ';
 469   $ops .= 'Devanagari-Latin; ';
 470   $ops .= 'Bengali-Latin; ';
 471   $ops .= 'Gurmukhi-Latin; ';
 472   $ops .= 'Gujarati-Latin; ';
 473   $ops .= 'Oriya-Latin; ';
 474   $ops .= 'Tamil-Latin; ';
 475   $ops .= 'Telugu-Latin; ';
 476   $ops .= 'Kannada-Latin; ';
 477   $ops .= 'Malayalam-Latin; ';
 478   $ops .= 'Thai-Latin; ';
 479   $ops .= 'Georgian-Latin; ';
 480   $ops .= 'Hangul-Latin; ';
 481   $ops .= 'Mongolian-Latin/BGN; ';
 482   $ops .= 'Jamo-Latin; ';
 483   $ops .= 'Katakana-Latin; ';
 484   $ops .= 'Any-Latin; ';
 485
 486   // Finally, after transforming to Latin, transform to ASCII.
 487   $ops .= 'Latin-ASCII; ';
 488
 489   // Remove any remaining accents and recompose.
 490   $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';
 491
 492   $trans = Transliterator::create($ops);
 493   $out = array();
 494
 495   // Transliterate all possible characters.
 496   for ($bank = 0; $bank < 256; $bank++) {
 497     $data = array();
 498     for ($chr = 0; $chr < 256; $chr++) {
 499       // Skip the UTF-16 and "private use" ranges completely.
 500       $OK = ($bank <= 0xd8 || $bank > 0xf8);
 501
 502       $result = $OK ? $trans->transliterate(mb_convert_encoding(pack('n', 256 * $bank + $chr), 'UTF-8', 'UTF-16BE')) : '';
 503
 504       // See if we have managed to transliterate this to ASCII or not. If not,
 505       // return NULL instead of this character.
 506       $max = chr(127);
 507       foreach (preg_split('//u', $result, 0, PREG_SPLIT_NO_EMPTY) as $character) {
 508         if ($character > $max) {
 509           $OK = $OK && FALSE;
 510           break;
 511         }
 512       }
 513       $data[$chr] = ($OK) ? $result : NULL;
 514     }
 515     $out[$bank] = $data;
 516   }
 517
 518   return $out;
 519 }
 520
 521 /**
 522  * Reads in the JUnidecode data set.
 523  *
 524  * The data is expected to be in files XNN.java in directory 'junidecode' under
 525  * this file's directory. It can be downloaded from
 526  * http://www.ippatsuman.com/projects/junidecode/index.html
 527  *
 528  * @return array
 529  *   Nested array of transliteration data. Outer keys are the first two
 530  *   bytes of Unicode characters (or 0 for base ASCII characters). The next
 531  *   level is the other two bytes, and the values are the transliterations.
 532  */
 533 function read_junidecode_data() {
 534   $dir = __DIR__ . '/junidecode';
 535   $out = array();
 536
 537   // Read data files.
 538   for ($bank = 0; $bank < 256; $bank++) {
 539     $base = array();
 540     $file = $dir . '/X' . sprintf('%02x', $bank) . '.java';
 541     if (is_file($file)) {
 542       $base = _junidecode_read_file($file);
 543     }
 544     $out[$bank] = $base;
 545   }
 546
 547   return $out;
 548 }
 549
 550 /**
 551  * Reads in the data in a single file from the JUnidecode project.
 552  *
 553  * @param string $file
 554  *   File to read from.
 555  *
 556  * @return array
 557  *   Data read from the file.
 558  *
 559  * @see read_junidecode_data()
 560  */
 561 function _junidecode_read_file($file) {
 562   $contents = file($file);
 563   $save = '';
 564   foreach ($contents as $line) {
 565     // Discard lines starting with * or / or package or class or public or },
 566     // to get rid of comments and Java code.
 567     if (preg_match('|^\s*[\*/\}]|', $line)) {
 568       continue;
 569     }
 570     if (preg_match('/^\s*package|public|class/', $line)) {
 571       continue;
 572     }
 573
 574     // Some of the lines look like this:
 575     //      new String("" + (char) 0x00), // 0x00
 576     // Transform to be '0x00,'
 577     $line = preg_replace('|^\s*new\s+String\s*\(\s*""\s*\+\s*\(char\)\s+0x([0-9]+).*$|', '0x$1,', $line);
 578
 579     // Strings are in double quotes, yet many have \' in them.
 580     $line = str_replace("\'", "'", $line);
 581
 582     // Everything else should probably be OK -- the lines are like:
 583     //  "Ie", // 0x00
 584     $save .= $line;
 585   }
 586
 587   // Evaluate as an array.
 588   $save = 'return array(' . $save . ');';
 589
 590   $data = @eval($save);
 591   if (isset($data) && is_array($data)) {
 592     $data = array_map('_replace_hex_with_character', $data);
 593     $data = array_map('_replace_question_with_null', $data);
 594   }
 595   else {
 596     // There was a problem, so throw an error and exit.
 597     print "Problem in evaluating $file\n";
 598     print $save;
 599     eval($save);
 600     exit();
 601   }
 602
 603   return $data;
 604 }
 605
 606 /**
 607  * Callback for array_map(): Returns $data, with '[?]' replaced with NULL.
 608  */
 609 function _replace_question_with_null($data) {
 610   return ($data == '[?]' || $data == '[?] ') ? NULL : $data;
 611 }
 612
 613 /**
 614  * Callback for array_map(): Replaces '\xNN' with the actual character.
 615  */
 616 function _replace_hex_with_character($item) {
 617   if (strpos($item, '\x') === 0) {
 618     $item = eval($item);
 619   }
 620   return $item;
 621 }
 622
 623 /**
 624  * Writes a data file out in the standard Drupal Core data format.
 625  *
 626  * @param array $data
 627  *   Array of data to write out.
 628  * @param string $bank
 629  *   Bank of characters it belongs to.
 630  * @param string $dir
 631  *   Output directory.
 632  */
 633 function write_data_file($data, $bank, $outdir) {
 634   $dir = __DIR__ . '/' . $outdir;
 635   $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';
 636
 637   $out = '';
 638   $out .= "<?php\n\n/**\n * @file\n * Generic transliteration data for the PhpTransliteration class.\n */\n\n\$base = array(\n";
 639
 640   // The 00 file skips the ASCII range
 641   $start = 0;
 642   if ($bank == 0) {
 643     $start = 0x80;
 644     $out .= "  // Note: to save memory plain ASCII mappings have been left out.\n";
 645   }
 646
 647   for ($line = $start; $line <= 0xf0; $line += 0x10) {
 648     $out .= '  0x' . sprintf('%02X', $line) . ' =>';
 649     $elems = array_values(array_slice($data, $line, 16));
 650     for ($i = 0; $i < 16; $i++ ) {
 651       if (isset($elems[$i])) {
 652         $out .= " '" . addcslashes($elems[$i], "'\\") . "',";
 653       }
 654       else {
 655         $out .= ' NULL,';
 656       }
 657     }
 658     $out .= "\n";
 659   }
 660
 661   $out .= ");\n";
 662
 663   file_put_contents($file, $out);
 664 }