Find this useful? Enter your email to receive occasional updates for securing PHP code.
Signing you up...
Thank you for signing up!
PHP Decode
#!/usr/bin/env php <?php error_reporting(E_ALL); /** * This is based on the ucgendat.c f..
Decoded Output download
#!/usr/bin/env php
<?php error_reporting(E_ALL);
/**
* This is based on the ucgendat.c file from the OpenLDAP project, licensed as
* follows. This file is not necessary to build PHP. It's only necessary to
* rebuild unicode_data.h and eaw_table.h from Unicode ucd files.
*
* Example usage:
* php ucgendat.php path/to/Unicode/data/files
*/
/* Copyright 1998-2007 The OpenLDAP Foundation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available at
* <http://www.OpenLDAP.org/license.html>.
*/
/* Copyright 2001 Computing Research Labs, New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Command-line entry point: validate the arguments and input files, then
// regenerate the Unicode property/case header and the East Asian Width table.
if ($argc < 2) {
    // The script is called ucgendat.php (see the usage example in the file header).
    echo "Usage: php ucgendat.php ./datadir\n";
    echo "./datadir must contain:\n";
    echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt, DerivedCoreProperties.txt, and EastAsianWidth.txt\n";
    return;
}

$dir = $argv[1];
$unicodeDataFile = $dir . '/UnicodeData.txt';
$caseFoldingFile = $dir . '/CaseFolding.txt';
$specialCasingFile = $dir . '/SpecialCasing.txt';
$derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt';
$eastAsianWidthFile = $dir . '/EastAsianWidth.txt';

// Bail out before touching any output file if an input is missing.
$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile, $eastAsianWidthFile];
foreach ($files as $file) {
    if (!file_exists($file)) {
        echo "File $file does not exist.\n";
        return;
    }
}

// Property ranges and case mappings go into unicode_data.h.
$outputFile = __DIR__ . "/../unicode_data.h";
$data = new UnicodeData;
parseUnicodeData($data, file_get_contents($unicodeDataFile));
parseCaseFolding($data, file_get_contents($caseFoldingFile));
parseSpecialCasing($data, file_get_contents($specialCasingFile));
parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile));
file_put_contents($outputFile, generateData($data));

// Wide/fullwidth ranges go into the libmbfl East Asian Width table.
$eawFile = __DIR__ . "/../libmbfl/mbfl/eaw_table.h";
$eawData = parseEastAsianWidth(file_get_contents($eastAsianWidthFile));
file_put_contents($eawFile, generateEastAsianWidthData($eawData));
/**
 * Inclusive range of code points [start, end].
 *
 * Both bounds are public and mutable: callers extend a range in place by
 * changing $end.
 */
class Range {
    /** @var int First code point of the range. */
    public $start;

    /** @var int Last code point of the range (inclusive). */
    public $end;

    public function __construct(int $start, int $end) {
        list($this->start, $this->end) = [$start, $end];
    }
}
/**
 * Accumulates everything parsed from the Unicode data files: code point
 * ranges per property and the upper/lower/title/fold case maps.
 */
class UnicodeData {
    /** @var array<string,int> Property name => index into $propRanges. */
    public $propIndexes;

    /** @var int Number of tracked properties. */
    public $numProps;

    /** @var Range[][] One list of ranges per property index. */
    public $propRanges;

    /** @var array<string,array> Case maps keyed by 'upper', 'lower', 'title' and 'fold'. */
    public $caseMaps;

    /** @var int[] Flat list of code points for multi-character case mappings (filled outside this class). */
    public $extraCaseData;

    public function __construct() {
        /*
         * List of properties expected to be found in the Unicode Character Database.
         */
        $this->propIndexes = array_flip([
            "Mn", "Mc", "Me", "Nd", "Nl", "No",
            "Zs", "Zl", "Zp", "Cs", "Co", "Cn",
            "Lu", "Ll", "Lt", "Lm", "Lo", "Sm",
            "Sc", "Sk", "So", "L", "R", "EN",
            "ES", "ET", "AN", "CS", "B", "S",
            "WS", "ON", "AL",
            "C", "P", "Cased", "Case_Ignorable"
        ]);
        $this->numProps = count($this->propIndexes);
        $this->propRanges = array_fill(0, $this->numProps, []);
        $this->caseMaps = [
            'upper' => [],
            'lower' => [],
            'title' => [],
            'fold' => [],
        ];
        $this->extraCaseData = [];
    }

    /**
     * Maps a property name from the data files to its index in $propRanges.
     *
     * Several source properties are folded into one stored property first.
     *
     * @throws Exception if the (folded) property is not a tracked property
     */
    public function propToIndex(string $prop) : int {
        /* Deal with directionality codes introduced in Unicode 3.0. */
        if (in_array($prop, ["BN", "NSM", "PDF", "LRE", "LRO", "RLE", "RLO", "LRI", "RLI", "FSI", "PDI"], true)) {
            /*
             * Mark all of these as Other Neutral to preserve compatibility with
             * older versions.
             */
            $prop = "ON";
        }

        /* Merge all punctuation into a single category for efficiency of access.
         * We're currently not interested in distinguishing different kinds of punctuation. */
        if (in_array($prop, ["Pc", "Pd", "Ps", "Pe", "Po", "Pi", "Pf"], true)) {
            $prop = "P";
        }

        /* Same for control. */
        if (in_array($prop, ["Cc", "Cf"], true)) {
            $prop = "C";
        }

        if (!isset($this->propIndexes[$prop])) {
            throw new Exception("Unknown property $prop");
        }

        return $this->propIndexes[$prop];
    }

    /**
     * Records a single code point for a property, growing the most recently
     * added range when the code point directly follows it.
     */
    public function addProp(int $code, string $prop) {
        $propIdx = $this->propToIndex($prop);

        // Range objects are shared handles, so extending the last one through
        // this local copy of the list also updates $this->propRanges.
        $ranges = $this->propRanges[$propIdx];
        $lastRange = end($ranges);
        if ($lastRange !== false && $code === $lastRange->end + 1) {
            $lastRange->end++;
            return;
        }

        $this->propRanges[$propIdx][] = new Range($code, $code);
    }

    /**
     * Records an explicit range for a property. No merging happens here;
     * adjacent ranges are joined later by compactPropRanges().
     */
    public function addPropRange(int $startCode, int $endCode, string $prop) {
        $propIdx = $this->propToIndex($prop);
        $this->propRanges[$propIdx][] = new Range($startCode, $endCode);
    }

    /** Stores (or overwrites) the simple mapping of $origCode in the given case map. */
    public function addCaseMapping(string $case, int $origCode, int $mappedCode) {
        $this->caseMaps[$case][$origCode] = $mappedCode;
    }

    /**
     * Sorts ranges by start and joins ranges that touch each other.
     *
     * The surviving Range objects are modified in place.
     *
     * @param Range[] $ranges
     * @return Range[] sorted ranges with no two adjacent entries
     * @throws Exception if two ranges overlap
     */
    public function compactRangeArray(array $ranges) : array {
        // Sort by start codepoint
        usort($ranges, function (Range $r1, Range $r2) {
            return $r1->start <=> $r2->start;
        });

        $newRanges = [];
        $lastRange = null;
        foreach ($ranges as $range) {
            if ($lastRange === null) {
                $lastRange = $range;
            } else if ($range->start == $lastRange->end + 1) {
                // Directly adjacent: extend the pending range.
                $lastRange->end = $range->end;
            } else if ($range->start > $lastRange->end + 1) {
                // Gap: the pending range is complete.
                $newRanges[] = $lastRange;
                $lastRange = $range;
            } else {
                throw new Exception(sprintf(
                    "Overlapping ranges [%x, %x] and [%x, %x]",
                    $lastRange->start, $lastRange->end,
                    $range->start, $range->end
                ));
            }
        }
        if ($lastRange !== null) {
            $newRanges[] = $lastRange;
        }
        return $newRanges;
    }

    /** Compacts the range list of every property. */
    public function compactPropRanges() {
        // Assign by key instead of by reference so no reference outlives the loop.
        foreach ($this->propRanges as $propIdx => $ranges) {
            $this->propRanges[$propIdx] = $this->compactRangeArray($ranges);
        }
    }
}
/**
 * Lazily splits a semicolon-separated data file into records.
 *
 * Everything from '#' to the end of a line is dropped, lines that are empty
 * afterwards are skipped, and every field is trimmed.
 *
 * @return Generator yields one array of string fields per data line
 */
function parseDataFile(string $input) {
    foreach (explode("\n", $input) as $rawLine) {
        // Cut off a trailing comment, if there is one.
        $commentStart = strpos($rawLine, '#');
        $content = $commentStart === false ? $rawLine : substr($rawLine, 0, $commentStart);

        $content = trim($content);
        if ($content !== '') {
            yield array_map('trim', explode(';', $content));
        }
    }
}
/**
 * Parses UnicodeData.txt: records general category and bidi class for every
 * code point (or First/Last range) and the simple upper/lower/title mappings.
 *
 * @throws Exception on malformed lines or a range start without an end line
 */
function parseUnicodeData(UnicodeData $data, string $input) : void {
    $lines = parseDataFile($input);
    foreach ($lines as $fields) {
        if (count($fields) != 15) {
            throw new Exception("Line does not contain 15 fields");
        }

        $code = intval($fields[0], 16);

        $name = $fields[1];
        if ($name === '') {
            throw new Exception("Empty name");
        }

        $generalCategory = $fields[2];
        $bidiClass = $fields[4];

        if ($name[0] === '<' && $name !== '<control>') {
            // This is a character range: the following line carries the last
            // code point. Consume it here; the foreach then moves past it.
            $lines->next();
            if (!$lines->valid()) {
                throw new Exception("Range starting at {$fields[0]} has no end line");
            }
            $nextFields = $lines->current();
            if (count($nextFields) != 15) {
                throw new Exception("Line does not contain 15 fields");
            }
            $nextCode = intval($nextFields[0], 16);

            $data->addPropRange($code, $nextCode, $generalCategory);
            $data->addPropRange($code, $nextCode, $bidiClass);
            continue;
        }

        $data->addProp($code, $generalCategory);
        $data->addProp($code, $bidiClass);

        // Empty fields parse as 0, which means "no mapping".
        $upperCase = intval($fields[12], 16);
        $lowerCase = intval($fields[13], 16);
        // A missing title-case mapping falls back to the upper-case mapping.
        $titleCase = intval($fields[14], 16) ?: $upperCase;
        if ($upperCase) {
            $data->addCaseMapping('upper', $code, $upperCase);
        }
        if ($lowerCase) {
            $data->addCaseMapping('lower', $code, $lowerCase);
        }
        if ($titleCase) {
            $data->addCaseMapping('title', $code, $titleCase);
        }
    }
}
/**
 * Converts a space-separated list of hexadecimal code points into integers.
 *
 * @return int[]
 */
function parseCodes(string $strCodes) : array {
    return array_map(function ($hex) {
        return intval($hex, 16);
    }, explode(' ', $strCodes));
}
/**
 * Parses CaseFolding.txt into the 'fold' case map.
 *
 * Simple foldings (status C or S) are stored as an int. Full foldings
 * (status F) are stored as an array whose first element is the simple folding
 * and whose remaining elements are the full folding. Status T lines are skipped.
 *
 * @throws Exception on malformed lines or an unknown status
 */
function parseCaseFolding(UnicodeData $data, string $input) : void {
    foreach (parseDataFile($input) as $fields) {
        if (count($fields) != 4) {
            throw new Exception("Line does not contain 4 fields");
        }

        $code = intval($fields[0], 16);
        $status = $fields[1];
        if ($status == 'T') {
            // Use language-agnostic case folding
            continue;
        }

        if ($status == 'C' || $status == 'S') {
            $foldCode = intval($fields[2], 16);
            if (!isset($data->caseMaps['fold'][$code])) {
                $data->addCaseMapping('fold', $code, $foldCode);
            } else {
                // Add simple mapping to full mapping data
                assert(is_array($data->caseMaps['fold'][$code]));
                $data->caseMaps['fold'][$code][0] = $foldCode;
            }
        } else if ($status == 'F') {
            $foldCodes = parseCodes($fields[2]);
            // Keep a simple folding that was recorded before this line; when
            // there is none, the code point folds to itself in simple mode.
            $existingFoldCode = $data->caseMaps['fold'][$code] ?? $code;
            $simpleFoldCode = is_array($existingFoldCode) ? $existingFoldCode[0] : $existingFoldCode;
            $data->caseMaps['fold'][$code] = array_merge([$simpleFoldCode], $foldCodes);
        } else {
            // Fail even when assertions are disabled.
            throw new Exception("Unknown case folding status $status");
        }
    }
}
/**
 * Merges one mapping from SpecialCasing.txt into the case map of $type.
 *
 * A multi-code-point mapping is stored as an array: the simple mapping first,
 * followed by the full mapping. A single-code-point mapping must match the
 * simple mapping that is already known.
 *
 * @throws Exception if a one-element mapping disagrees with the simple mapping,
 *                   or if the mapping has more than 3 code points
 */
function addSpecialCasing(UnicodeData $data, string $type, int $code, array $caseCodes) : void {
    $simple = $data->caseMaps[$type][$code] ?? $code;
    $numCodes = count($caseCodes);

    if ($numCodes != 1) {
        if ($numCodes > 3) {
            throw new Exception("Special case mapping with more than 3 code points");
        }
        $data->caseMaps[$type][$code] = array_merge([$simple], $caseCodes);
        return;
    }

    $only = $caseCodes[0];
    if ($only != $simple) {
        throw new Exception("Simple case code in special casing does not match");
    }

    // Special case: If a title-case character maps to itself, we may still have to store it,
    // if there is a non-trivial upper-case mapping for it
    $upper = $data->caseMaps['upper'][$code] ?? $code;
    if ($type == 'title' && $code == $only && $upper != $code) {
        $data->caseMaps['title'][$code] = $code;
    }
}
/**
 * Parses SpecialCasing.txt and applies its unconditional mappings.
 *
 * @throws Exception if a line does not have 5 or 6 fields
 */
function parseSpecialCasing(UnicodeData $data, string $input) : void {
    foreach (parseDataFile($input) as $fields) {
        $numFields = count($fields);
        if ($numFields < 5 || $numFields > 6) {
            throw new Exception("Line does not contain 5 or 6 fields");
        }

        // Only use unconditional mappings
        if ($fields[4]) {
            continue;
        }

        $code = intval($fields[0], 16);
        // Column order in the file is lower, title, upper; title is applied
        // last because it consults the upper-case map.
        $columns = ['lower' => $fields[1], 'upper' => $fields[3], 'title' => $fields[2]];
        foreach ($columns as $type => $strCodes) {
            addSpecialCasing($data, $type, $code, parseCodes($strCodes));
        }
    }
}
/**
 * Parses DerivedCoreProperties.txt, keeping only the Cased and
 * Case_Ignorable properties.
 *
 * @throws Exception on a wrong field count or a malformed code point range
 */
function parseDerivedCoreProperties(UnicodeData $data, string $input) : void {
    foreach (parseDataFile($input) as $fields) {
        if (count($fields) != 2) {
            throw new Exception("Line does not contain 2 fields");
        }

        list($codes, $property) = $fields;
        if (!in_array($property, ['Cased', 'Case_Ignorable'], true)) {
            continue;
        }

        // Either a single code point or "start..end".
        $bounds = explode('..', $codes);
        switch (count($bounds)) {
            case 1:
                $data->addProp(intval($bounds[0], 16), $property);
                break;
            case 2:
                $data->addPropRange(intval($bounds[0], 16), intval($bounds[1], 16), $property);
                break;
            default:
                throw new Exception("Invalid range");
        }
    }
}
/**
 * Parses EastAsianWidth.txt and collects the code points whose width class
 * is W or F.
 *
 * An entry that starts right after the previously collected range extends
 * that range instead of opening a new one.
 *
 * @return Range[] ranges in file order
 * @throws Exception if a data line has fewer than 2 fields
 */
function parseEastAsianWidth(string $input) : array {
    $wideRanges = [];

    foreach (parseDataFile($input) as $fields) {
        if (count($fields) < 2) {
            throw new Exception("Line does not contain 2 fields");
        }
        if ($fields[1] != 'W' && $fields[1] != 'F') {
            continue;
        }

        // A single code point is treated as a range of length one, so both
        // forms share the merge logic below.
        $bounds = explode('..', $fields[0], 2);
        $startCode = intval($bounds[0], 16);
        $endCode = isset($bounds[1]) ? intval($bounds[1], 16) : $startCode;

        $lastRange = end($wideRanges);
        if ($lastRange !== false && $startCode == $lastRange->end + 1) {
            $lastRange->end = $endCode;
            continue;
        }

        $wideRanges[] = new Range($startCode, $endCode);
    }

    return $wideRanges;
}
/**
 * Formats a list of values as the body of a C array initializer.
 *
 * Values are separated by commas. Each row of $width values starts on a new
 * line indented with a tab (the tab is missing from this decoded listing; the
 * original listing of the same function uses "\n\t").
 *
 * @param array  $values list with consecutive integer keys starting at 0
 * @param int    $width  number of values per row
 * @param string $format sprintf() format applied to each value
 */
function formatArray(array $values, int $width, string $format) : string {
    $result = '';
    $total = count($values);
    for ($pos = 0; $pos < $total; $pos++) {
        $separator = $pos % $width == 0 ? "\n\t" : " ";
        if ($pos != 0) {
            $separator = ',' . $separator;
        }
        $result .= $separator . sprintf($format, $values[$pos]);
    }
    return $result;
}
/** Formats values as 4-digit, zero-padded hexadecimal literals. */
function formatShortHexArray(array $values, int $width) : string {
    $format = "0x%04x";
    return formatArray($values, $width, $format);
}
/** Formats values as decimal numbers padded with spaces to 5 characters. */
function formatShortDecArray(array $values, int $width) : string {
    $format = "% 5d";
    return formatArray($values, $width, $format);
}
/** Formats values as 8-digit, zero-padded hexadecimal literals. */
function formatIntArray(array $values, int $width) : string {
    $format = "0x%08x";
    return formatArray($values, $width, $format);
}
/**
 * Emits the C tables for property lookup: _ucprop_size, _ucprop_offsets and
 * _ucprop_ranges.
 *
 * _ucprop_ranges holds start/end pairs of every property back to back, and
 * _ucprop_offsets holds the position of each property's first pair.
 *
 * Compacts the property ranges of $data as a side effect.
 *
 * @return string C source fragment
 */
function generatePropData(UnicodeData $data) {
    $data->compactPropRanges();

    $propOffsets = [];
    $values = [];
    foreach ($data->propRanges as $ranges) {
        // Two entries (start, end) are written per range, so the number of
        // values written so far is the offset of this property.
        $propOffsets[] = count($values);
        foreach ($ranges as $range) {
            $values[] = $range->start;
            $values[] = $range->end;
        }
    }

    // Add sentinel for binary search
    $propOffsets[] = count($values);

    // TODO ucgendat.c pads the prop offsets to the next multiple of 4
    // for rather dubious reasons of alignment. This should probably be
    // dropped
    while (count($propOffsets) % 4 != 0) {
        $propOffsets[] = 0;
    }

    $result = "";
    $result .= "static const unsigned short _ucprop_size = $data->numProps;\n\n";
    $result .= "static const unsigned short _ucprop_offsets[] = {";
    $result .= formatShortHexArray($propOffsets, 8);
    $result .= "\n};\n\n";

    $result .= "static const unsigned int _ucprop_ranges[] = {";
    $result .= formatIntArray($values, 4);
    $result .= "\n};\n\n";
    return $result;
}
/**
 * Flattens an array of arrays by one level into a single re-indexed list.
 */
function flatten(array $array) {
    $flat = [];
    foreach ($array as $row) {
        // Skip empty rows: there is nothing to push.
        if ($row) {
            array_push($flat, ...array_values($row));
        }
    }
    return $flat;
}
/**
 * Puts the case maps of $data into the form that is written to the tables.
 *
 * Title-case entries that equal the upper-case mapping are dropped. Every
 * array-valued (multi-code-point) mapping is appended to $data->extraCaseData
 * and replaced by an integer that packs the number of extra code points into
 * the bits above bit 24 and the index into the extra table into the low bits.
 */
function prepareCaseData(UnicodeData $data) {
    // Don't store titlecase if it's the same as uppercase
    foreach ($data->caseMaps['title'] as $code => $titleCode) {
        $upperCode = $data->caseMaps['upper'][$code] ?? $code;
        if ($titleCode == $upperCode) {
            unset($data->caseMaps['title'][$code]);
        }
    }

    // Store full (multi-char) case mappings in a separate table and only
    // store an index into it
    foreach ($data->caseMaps as $type => $caseMap) {
        foreach ($caseMap as $code => $caseCode) {
            if (!is_array($caseCode)) {
                continue;
            }

            // -1 because the first entry is the simple case mapping
            $extraCount = count($caseCode) - 1;
            $offset = count($data->extraCaseData);
            foreach ($caseCode as $codePoint) {
                $data->extraCaseData[] = $codePoint;
            }
            $data->caseMaps[$type][$code] = ($extraCount << 24) | $offset;
        }
    }
}
/**
 * Builds the perfect-hash tables for one case map and returns them as C
 * source: _uccase_<name>_g (with its size) and _uccase_<name>_table (with
 * its size). Also prints the table sizes to standard output.
 *
 * @return string C source fragment
 */
function generateCaseMPH(string $name, array $map) {
    $prefix = "_uccase_" . $name;

    // Always use the exhaustive (non-fast) search.
    [$gTable, $table] = generateMPH($map, false);
    $gCount = count($gTable);
    $tableCount = count($table);
    echo "$name: n=", $tableCount, ", g=", $gCount, "\n";

    return "static const unsigned {$prefix}_g_size = " . $gCount . ";\n"
        . "static const short {$prefix}_g[] = {"
        . formatShortDecArray($gTable, 8)
        . "\n};\n"
        . "static const unsigned {$prefix}_table_size = " . $tableCount . ";\n"
        . "static const unsigned {$prefix}_table[] = {"
        . formatIntArray(flatten($table), 4)
        . "\n};\n";
}
/**
 * Emits the C tables for all four case maps, followed by the table of extra
 * (multi-code-point) case data.
 *
 * @return string C source fragment
 */
function generateCaseData(UnicodeData $data) {
    prepareCaseData($data);

    $result = "";
    foreach (['upper', 'lower', 'title', 'fold'] as $type) {
        $result .= generateCaseMPH($type, $data->caseMaps[$type]);
    }

    return $result
        . "static const unsigned _uccase_extra_table[] = {"
        . formatIntArray($data->extraCaseData, 4)
        . "\n};\n";
}
/**
 * Produces the complete contents of unicode_data.h: a "do not edit" banner
 * followed by the property tables and the case tables.
 *
 * @return string C header source
 */
function generateData(UnicodeData $data) {
    $banner = <<<'HEADER'
/* This file was generated from a modified version of UCData's ucgendat.
*
* DO NOT EDIT THIS FILE!
*
* Instead, download the appropriate UnicodeData-x.x.x.txt and
* CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/
* and run ext/mbstring/ucgendat/ucgendat.php.
*
* More information can be found in the UCData package. Unfortunately,
* the project's page doesn't seem to be live anymore, so you can use
* OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */
HEADER;

    // Operands are evaluated left to right: property data is generated
    // before case data.
    return $banner . "\n" . generatePropData($data) . generateCaseData($data);
}
/*
* Minimal Perfect Hash Generation
*
* Based on "Hash, displace, and compress" algorithm due to
* Belazzougui, Botelho and Dietzfelbinger.
*
* Hash function based on https://stackoverflow.com/a/12996028/385378.
* MPH implementation based on http://stevehanov.ca/blog/index.php?id=119.
*/
/**
 * Integer hash used by the perfect hash: mixes key $x with seed $d and
 * returns the result masked to 32 bits.
 */
function hashInt(int $d, int $x) {
    $seeded = $x ^ $d;
    $mixed = ($seeded ^ ($seeded >> 16)) * 0x45d9f3b;
    return $mixed & 0xffffffff;
}
/**
 * Attempts to build a minimal perfect hash for $map with $gSize first-level
 * buckets.
 *
 * For every first-level bucket the g table holds either a seed $d (positive)
 * that spreads the bucket's keys over free table slots, or the negated slot
 * index for buckets with a single key. Unused g entries keep the filler
 * value 0x7fff.
 *
 * @param array $map   integer key => value
 * @param int   $gSize number of first-level buckets
 * @return array [] on failure, otherwise [$gTable, $table] where $table maps
 *               slot => [key, value]
 */
function tryGenerateMPH(array $map, int $gSize) {
    $tableSize = count($map);
    if ($tableSize > 0 && $gSize < 1) {
        // Bucket indices are taken modulo $gSize, so a non-empty map cannot
        // be hashed into zero buckets. Report failure instead of dividing by zero.
        return [];
    }

    $table = [];
    $gTable = array_fill(0, $gSize, 0x7fff);
    $buckets = [];

    foreach ($map as $k => $v) {
        $h = hashInt(0, $k) % $gSize;
        $buckets[$h][] = [$k, $v];
    }

    // Sort by descending number of collisions
    usort($buckets, function ($b1, $b2) {
        return -(count($b1) <=> count($b2));
    });

    foreach ($buckets as $bucket) {
        $collisions = count($bucket);
        if ($collisions <= 1) {
            continue;
        }

        // Try values of $d until all elements placed in different slots
        $d = 1;
        $i = 0;
        $used = [];
        while ($i < $collisions) {
            if ($d > 0x7fff) {
                // The seed would no longer fit next to the 0x7fff filler.
                return [];
            }

            list($k) = $bucket[$i];
            $slot = hashInt($d, $k) % $tableSize;
            if (isset($table[$slot]) || isset($used[$slot])) {
                // Slot taken: restart this bucket with the next seed.
                $d++;
                $i = 0;
                $used = [];
            } else {
                $i++;
                $used[$slot] = true;
            }
        }

        $g = hashInt(0, $bucket[0][0]) % $gSize;
        $gTable[$g] = $d;
        foreach ($bucket as $elem) {
            $table[hashInt($d, $elem[0]) % $tableSize] = $elem;
        }
    }

    $freeSlots = [];
    for ($i = 0; $i < $tableSize; $i++) {
        if (!isset($table[$i])) {
            $freeSlots[] = $i;
        }
    }

    // For buckets with only one element, we directly store the index
    $freeIdx = 0;
    foreach ($buckets as $bucket) {
        if (count($bucket) != 1) {
            continue;
        }

        $elem = $bucket[0];
        $slot = $freeSlots[$freeIdx++];
        $table[$slot] = $elem;

        $g = hashInt(0, $elem[0]) % $gSize;
        $gTable[$g] = -$slot;
    }

    // Bring both tables into index order for output.
    ksort($gTable);
    ksort($table);

    return [$gTable, $table];
}
/**
 * Generates a minimal perfect hash for $map, searching for a g table size
 * that works.
 *
 * @param array $map  integer key => value
 * @param bool  $fast true: try sizes count/lambda for lambda = 5.0 down to 0.5
 *                    in steps of 0.5; false: try every size starting at count/7
 * @return array [$gTable, $table] as returned by tryGenerateMPH()
 * @throws Exception if the fast search finds no working size
 */
function generateMPH(array $map, bool $fast) {
    $count = count($map);
    // A non-empty map needs at least one bucket; count/lambda truncates to 0
    // for small maps. An empty map keeps the size 0 it always had.
    $minSize = $count > 0 ? 1 : 0;

    if ($fast) {
        // Check size starting lambda=5.0 in 0.5 increments
        for ($lambda = 5.0; $lambda >= 0.5; $lambda -= 0.5) {
            $m = max($minSize, (int) ($count / $lambda));
            $mph = tryGenerateMPH($map, $m);
            if (!empty($mph)) {
                return $mph;
            }
        }
        // Stop before lambda reaches 0, which would divide by zero.
        throw new Exception("Failed to generate minimal perfect hash");
    }

    // Check all sizes starting lambda=7.0
    for ($m = max($minSize, (int) ($count / 7.0));; $m++) {
        $mph = tryGenerateMPH($map, $m);
        if (!empty($mph)) {
            return $mph;
        }
    }
}
/**
 * Produces the contents of the East Asian Width table header: a banner, the
 * struct declaration and one initializer row per range.
 *
 * @param array $wideRanges objects with integer start and end properties
 * @return string C header source
 */
function generateEastAsianWidthData(array $wideRanges) {
    $banner = <<<'HEADER'
/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
*
* DO NOT EDIT THIS FILE!
*
* East Asian Width table
*
* Some characters in East Asian languages are intended to be displayed in a space
* which is roughly square. (This contrasts with others such as the Latin alphabet,
* which are taller than they are wide.) To display these East Asian characters
* properly, twice the horizontal space is used. This must be taken into account
* when doing things like wrapping text to a specific width.
*
* Each pair of numbers in the below table is a range of Unicode codepoints
* which should be displayed as double-width.
*/
static const struct {
int begin;
int end;
} mbfl_eaw_table[] = {
HEADER;

    $rows = '';
    foreach ($wideRanges as $range) {
        // Lower-case hexadecimal without padding.
        $rows .= sprintf(" { 0x%x, 0x%x },\n", $range->start, $range->end);
    }

    return $banner . $rows . "};\n";
}
?>
Did this file decode correctly?
Original Code
#!/usr/bin/env php
<?php error_reporting(E_ALL);
/**
* This is based on the ucgendat.c file from the OpenLDAP project, licensed as
* follows. This file is not necessary to build PHP. It's only necessary to
* rebuild unicode_data.h and eaw_table.h from Unicode ucd files.
*
* Example usage:
* php ucgendat.php path/to/Unicode/data/files
*/
/* Copyright 1998-2007 The OpenLDAP Foundation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available at
* <http://www.OpenLDAP.org/license.html>.
*/
/* Copyright 2001 Computing Research Labs, New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Command-line entry point: validate the arguments and input files, then
// regenerate the Unicode property/case header and the East Asian Width table.
if ($argc < 2) {
    // The script is called ucgendat.php (see the usage example in the file header).
    echo "Usage: php ucgendat.php ./datadir\n";
    echo "./datadir must contain:\n";
    echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt, DerivedCoreProperties.txt, and EastAsianWidth.txt\n";
    return;
}

$dir = $argv[1];
$unicodeDataFile = $dir . '/UnicodeData.txt';
$caseFoldingFile = $dir . '/CaseFolding.txt';
$specialCasingFile = $dir . '/SpecialCasing.txt';
$derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt';
$eastAsianWidthFile = $dir . '/EastAsianWidth.txt';

// Bail out before touching any output file if an input is missing.
$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile, $eastAsianWidthFile];
foreach ($files as $file) {
    if (!file_exists($file)) {
        echo "File $file does not exist.\n";
        return;
    }
}

// Property ranges and case mappings go into unicode_data.h.
$outputFile = __DIR__ . "/../unicode_data.h";
$data = new UnicodeData;
parseUnicodeData($data, file_get_contents($unicodeDataFile));
parseCaseFolding($data, file_get_contents($caseFoldingFile));
parseSpecialCasing($data, file_get_contents($specialCasingFile));
parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile));
file_put_contents($outputFile, generateData($data));

// Wide/fullwidth ranges go into the libmbfl East Asian Width table.
$eawFile = __DIR__ . "/../libmbfl/mbfl/eaw_table.h";
$eawData = parseEastAsianWidth(file_get_contents($eastAsianWidthFile));
file_put_contents($eawFile, generateEastAsianWidthData($eawData));
/**
 * Inclusive range of code points [start, end].
 *
 * Both bounds are public and mutable: callers extend a range in place by
 * changing $end.
 */
class Range {
    /** @var int First code point of the range. */
    public $start;

    /** @var int Last code point of the range (inclusive). */
    public $end;

    public function __construct(int $start, int $end) {
        list($this->start, $this->end) = [$start, $end];
    }
}
/**
 * Accumulates everything parsed from the Unicode data files: code point
 * ranges per property and the upper/lower/title/fold case maps.
 */
class UnicodeData {
    /** @var array<string,int> Property name => index into $propRanges. */
    public $propIndexes;

    /** @var int Number of tracked properties. */
    public $numProps;

    /** @var Range[][] One list of ranges per property index. */
    public $propRanges;

    /** @var array<string,array> Case maps keyed by 'upper', 'lower', 'title' and 'fold'. */
    public $caseMaps;

    /** @var int[] Flat list of code points for multi-character case mappings (filled outside this class). */
    public $extraCaseData;

    public function __construct() {
        /*
         * List of properties expected to be found in the Unicode Character Database.
         */
        $this->propIndexes = array_flip([
            "Mn", "Mc", "Me", "Nd", "Nl", "No",
            "Zs", "Zl", "Zp", "Cs", "Co", "Cn",
            "Lu", "Ll", "Lt", "Lm", "Lo", "Sm",
            "Sc", "Sk", "So", "L", "R", "EN",
            "ES", "ET", "AN", "CS", "B", "S",
            "WS", "ON", "AL",
            "C", "P", "Cased", "Case_Ignorable"
        ]);
        $this->numProps = count($this->propIndexes);
        $this->propRanges = array_fill(0, $this->numProps, []);
        $this->caseMaps = [
            'upper' => [],
            'lower' => [],
            'title' => [],
            'fold' => [],
        ];
        $this->extraCaseData = [];
    }

    /**
     * Maps a property name from the data files to its index in $propRanges.
     *
     * Several source properties are folded into one stored property first.
     *
     * @throws Exception if the (folded) property is not a tracked property
     */
    public function propToIndex(string $prop) : int {
        /* Deal with directionality codes introduced in Unicode 3.0. */
        if (in_array($prop, ["BN", "NSM", "PDF", "LRE", "LRO", "RLE", "RLO", "LRI", "RLI", "FSI", "PDI"], true)) {
            /*
             * Mark all of these as Other Neutral to preserve compatibility with
             * older versions.
             */
            $prop = "ON";
        }

        /* Merge all punctuation into a single category for efficiency of access.
         * We're currently not interested in distinguishing different kinds of punctuation. */
        if (in_array($prop, ["Pc", "Pd", "Ps", "Pe", "Po", "Pi", "Pf"], true)) {
            $prop = "P";
        }

        /* Same for control. */
        if (in_array($prop, ["Cc", "Cf"], true)) {
            $prop = "C";
        }

        if (!isset($this->propIndexes[$prop])) {
            throw new Exception("Unknown property $prop");
        }

        return $this->propIndexes[$prop];
    }

    /**
     * Records a single code point for a property, growing the most recently
     * added range when the code point directly follows it.
     */
    public function addProp(int $code, string $prop) {
        $propIdx = $this->propToIndex($prop);

        // Range objects are shared handles, so extending the last one through
        // this local copy of the list also updates $this->propRanges.
        $ranges = $this->propRanges[$propIdx];
        $lastRange = end($ranges);
        if ($lastRange !== false && $code === $lastRange->end + 1) {
            $lastRange->end++;
            return;
        }

        $this->propRanges[$propIdx][] = new Range($code, $code);
    }

    /**
     * Records an explicit range for a property. No merging happens here;
     * adjacent ranges are joined later by compactPropRanges().
     */
    public function addPropRange(int $startCode, int $endCode, string $prop) {
        $propIdx = $this->propToIndex($prop);
        $this->propRanges[$propIdx][] = new Range($startCode, $endCode);
    }

    /** Stores (or overwrites) the simple mapping of $origCode in the given case map. */
    public function addCaseMapping(string $case, int $origCode, int $mappedCode) {
        $this->caseMaps[$case][$origCode] = $mappedCode;
    }

    /**
     * Sorts ranges by start and joins ranges that touch each other.
     *
     * The surviving Range objects are modified in place.
     *
     * @param Range[] $ranges
     * @return Range[] sorted ranges with no two adjacent entries
     * @throws Exception if two ranges overlap
     */
    public function compactRangeArray(array $ranges) : array {
        // Sort by start codepoint
        usort($ranges, function (Range $r1, Range $r2) {
            return $r1->start <=> $r2->start;
        });

        $newRanges = [];
        $lastRange = null;
        foreach ($ranges as $range) {
            if ($lastRange === null) {
                $lastRange = $range;
            } else if ($range->start == $lastRange->end + 1) {
                // Directly adjacent: extend the pending range.
                $lastRange->end = $range->end;
            } else if ($range->start > $lastRange->end + 1) {
                // Gap: the pending range is complete.
                $newRanges[] = $lastRange;
                $lastRange = $range;
            } else {
                throw new Exception(sprintf(
                    "Overlapping ranges [%x, %x] and [%x, %x]",
                    $lastRange->start, $lastRange->end,
                    $range->start, $range->end
                ));
            }
        }
        if ($lastRange !== null) {
            $newRanges[] = $lastRange;
        }
        return $newRanges;
    }

    /** Compacts the range list of every property. */
    public function compactPropRanges() {
        // Assign by key instead of by reference so no reference outlives the loop.
        foreach ($this->propRanges as $propIdx => $ranges) {
            $this->propRanges[$propIdx] = $this->compactRangeArray($ranges);
        }
    }
}
/**
 * Lazily splits a semicolon-separated data file into records.
 *
 * Everything from '#' to the end of a line is dropped, lines that are empty
 * afterwards are skipped, and every field is trimmed.
 *
 * @return Generator yields one array of string fields per data line
 */
function parseDataFile(string $input) {
    foreach (explode("\n", $input) as $rawLine) {
        // Cut off a trailing comment, if there is one.
        $commentStart = strpos($rawLine, '#');
        $content = $commentStart === false ? $rawLine : substr($rawLine, 0, $commentStart);

        $content = trim($content);
        if ($content !== '') {
            yield array_map('trim', explode(';', $content));
        }
    }
}
/**
 * Parses UnicodeData.txt: records general category and bidi class for every
 * code point (or First/Last range) and the simple upper/lower/title mappings.
 *
 * @throws Exception on malformed lines or a range start without an end line
 */
function parseUnicodeData(UnicodeData $data, string $input) : void {
    $lines = parseDataFile($input);
    foreach ($lines as $fields) {
        if (count($fields) != 15) {
            throw new Exception("Line does not contain 15 fields");
        }

        $code = intval($fields[0], 16);

        $name = $fields[1];
        if ($name === '') {
            throw new Exception("Empty name");
        }

        $generalCategory = $fields[2];
        $bidiClass = $fields[4];

        if ($name[0] === '<' && $name !== '<control>') {
            // This is a character range: the following line carries the last
            // code point. Consume it here; the foreach then moves past it.
            $lines->next();
            if (!$lines->valid()) {
                throw new Exception("Range starting at {$fields[0]} has no end line");
            }
            $nextFields = $lines->current();
            if (count($nextFields) != 15) {
                throw new Exception("Line does not contain 15 fields");
            }
            $nextCode = intval($nextFields[0], 16);

            $data->addPropRange($code, $nextCode, $generalCategory);
            $data->addPropRange($code, $nextCode, $bidiClass);
            continue;
        }

        $data->addProp($code, $generalCategory);
        $data->addProp($code, $bidiClass);

        // Empty fields parse as 0, which means "no mapping".
        $upperCase = intval($fields[12], 16);
        $lowerCase = intval($fields[13], 16);
        // A missing title-case mapping falls back to the upper-case mapping.
        $titleCase = intval($fields[14], 16) ?: $upperCase;
        if ($upperCase) {
            $data->addCaseMapping('upper', $code, $upperCase);
        }
        if ($lowerCase) {
            $data->addCaseMapping('lower', $code, $lowerCase);
        }
        if ($titleCase) {
            $data->addCaseMapping('title', $code, $titleCase);
        }
    }
}
/**
 * Converts a space-separated list of hexadecimal code points into integers.
 *
 * @return int[]
 */
function parseCodes(string $strCodes) : array {
    return array_map(function ($hex) {
        return intval($hex, 16);
    }, explode(' ', $strCodes));
}
/**
 * Parses CaseFolding.txt into the 'fold' case map.
 *
 * Simple foldings (status C or S) are stored as an int. Full foldings
 * (status F) are stored as an array whose first element is the simple folding
 * and whose remaining elements are the full folding. Status T lines are skipped.
 *
 * @throws Exception on malformed lines or an unknown status
 */
function parseCaseFolding(UnicodeData $data, string $input) : void {
    foreach (parseDataFile($input) as $fields) {
        if (count($fields) != 4) {
            throw new Exception("Line does not contain 4 fields");
        }

        $code = intval($fields[0], 16);
        $status = $fields[1];
        if ($status == 'T') {
            // Use language-agnostic case folding
            continue;
        }

        if ($status == 'C' || $status == 'S') {
            $foldCode = intval($fields[2], 16);
            if (!isset($data->caseMaps['fold'][$code])) {
                $data->addCaseMapping('fold', $code, $foldCode);
            } else {
                // Add simple mapping to full mapping data
                assert(is_array($data->caseMaps['fold'][$code]));
                $data->caseMaps['fold'][$code][0] = $foldCode;
            }
        } else if ($status == 'F') {
            $foldCodes = parseCodes($fields[2]);
            // Keep a simple folding that was recorded before this line; when
            // there is none, the code point folds to itself in simple mode.
            $existingFoldCode = $data->caseMaps['fold'][$code] ?? $code;
            $simpleFoldCode = is_array($existingFoldCode) ? $existingFoldCode[0] : $existingFoldCode;
            $data->caseMaps['fold'][$code] = array_merge([$simpleFoldCode], $foldCodes);
        } else {
            // Fail even when assertions are disabled.
            throw new Exception("Unknown case folding status $status");
        }
    }
}
/**
 * Merges one mapping from SpecialCasing.txt into the case map of $type.
 *
 * A multi-code-point mapping is stored as an array: the simple mapping first,
 * followed by the full mapping. A single-code-point mapping must match the
 * simple mapping that is already known.
 *
 * @throws Exception if a one-element mapping disagrees with the simple mapping,
 *                   or if the mapping has more than 3 code points
 */
function addSpecialCasing(UnicodeData $data, string $type, int $code, array $caseCodes) : void {
    $simple = $data->caseMaps[$type][$code] ?? $code;
    $numCodes = count($caseCodes);

    if ($numCodes != 1) {
        if ($numCodes > 3) {
            throw new Exception("Special case mapping with more than 3 code points");
        }
        $data->caseMaps[$type][$code] = array_merge([$simple], $caseCodes);
        return;
    }

    $only = $caseCodes[0];
    if ($only != $simple) {
        throw new Exception("Simple case code in special casing does not match");
    }

    // Special case: If a title-case character maps to itself, we may still have to store it,
    // if there is a non-trivial upper-case mapping for it
    $upper = $data->caseMaps['upper'][$code] ?? $code;
    if ($type == 'title' && $code == $only && $upper != $code) {
        $data->caseMaps['title'][$code] = $code;
    }
}
/**
 * Parses SpecialCasing.txt and applies its unconditional mappings.
 *
 * @throws Exception if a line does not have 5 or 6 fields
 */
function parseSpecialCasing(UnicodeData $data, string $input) : void {
    foreach (parseDataFile($input) as $fields) {
        $numFields = count($fields);
        if ($numFields < 5 || $numFields > 6) {
            throw new Exception("Line does not contain 5 or 6 fields");
        }

        // Only use unconditional mappings
        if ($fields[4]) {
            continue;
        }

        $code = intval($fields[0], 16);
        // Column order in the file is lower, title, upper; title is applied
        // last because it consults the upper-case map.
        $columns = ['lower' => $fields[1], 'upper' => $fields[3], 'title' => $fields[2]];
        foreach ($columns as $type => $strCodes) {
            addSpecialCasing($data, $type, $code, parseCodes($strCodes));
        }
    }
}
/**
 * Parses DerivedCoreProperties.txt, keeping only the Cased and
 * Case_Ignorable properties.
 *
 * @throws Exception on a wrong field count or a malformed code point range
 */
function parseDerivedCoreProperties(UnicodeData $data, string $input) : void {
    foreach (parseDataFile($input) as $fields) {
        if (count($fields) != 2) {
            throw new Exception("Line does not contain 2 fields");
        }

        list($codes, $property) = $fields;
        if (!in_array($property, ['Cased', 'Case_Ignorable'], true)) {
            continue;
        }

        // Either a single code point or "start..end".
        $bounds = explode('..', $codes);
        switch (count($bounds)) {
            case 1:
                $data->addProp(intval($bounds[0], 16), $property);
                break;
            case 2:
                $data->addPropRange(intval($bounds[0], 16), intval($bounds[1], 16), $property);
                break;
            default:
                throw new Exception("Invalid range");
        }
    }
}
/**
 * Parses EastAsianWidth.txt and collects the code points whose width class
 * is W or F.
 *
 * An entry that starts right after the previously collected range extends
 * that range instead of opening a new one.
 *
 * @return Range[] ranges in file order
 * @throws Exception if a data line has fewer than 2 fields
 */
function parseEastAsianWidth(string $input) : array {
    $wideRanges = [];

    foreach (parseDataFile($input) as $fields) {
        if (count($fields) < 2) {
            throw new Exception("Line does not contain 2 fields");
        }
        if ($fields[1] != 'W' && $fields[1] != 'F') {
            continue;
        }

        // A single code point is treated as a range of length one, so both
        // forms share the merge logic below.
        $bounds = explode('..', $fields[0], 2);
        $startCode = intval($bounds[0], 16);
        $endCode = isset($bounds[1]) ? intval($bounds[1], 16) : $startCode;

        $lastRange = end($wideRanges);
        if ($lastRange !== false && $startCode == $lastRange->end + 1) {
            $lastRange->end = $endCode;
            continue;
        }

        $wideRanges[] = new Range($startCode, $endCode);
    }

    return $wideRanges;
}
/**
 * Formats a list of values as the body of a C array initializer.
 *
 * Values are separated by commas. Each row of $width values starts on a new
 * line indented with a tab.
 *
 * @param array  $values list with consecutive integer keys starting at 0
 * @param int    $width  number of values per row
 * @param string $format sprintf() format applied to each value
 */
function formatArray(array $values, int $width, string $format) : string {
    $result = '';
    $total = count($values);
    for ($pos = 0; $pos < $total; $pos++) {
        $separator = $pos % $width == 0 ? "\n\t" : " ";
        if ($pos != 0) {
            $separator = ',' . $separator;
        }
        $result .= $separator . sprintf($format, $values[$pos]);
    }
    return $result;
}
/** Formats values as 4-digit, zero-padded hexadecimal literals. */
function formatShortHexArray(array $values, int $width) : string {
    $format = "0x%04x";
    return formatArray($values, $width, $format);
}
/** Formats values as decimal numbers padded with spaces to 5 characters. */
function formatShortDecArray(array $values, int $width) : string {
    $format = "% 5d";
    return formatArray($values, $width, $format);
}
/** Formats values as 8-digit, zero-padded hexadecimal literals. */
function formatIntArray(array $values, int $width) : string {
    $format = "0x%08x";
    return formatArray($values, $width, $format);
}
/**
 * Emits the C tables for property lookup: _ucprop_size, _ucprop_offsets and
 * _ucprop_ranges.
 *
 * _ucprop_ranges holds start/end pairs of every property back to back, and
 * _ucprop_offsets holds the position of each property's first pair.
 *
 * Compacts the property ranges of $data as a side effect.
 *
 * @return string C source fragment
 */
function generatePropData(UnicodeData $data) {
    $data->compactPropRanges();

    $propOffsets = [];
    $values = [];
    foreach ($data->propRanges as $ranges) {
        // Two entries (start, end) are written per range, so the number of
        // values written so far is the offset of this property.
        $propOffsets[] = count($values);
        foreach ($ranges as $range) {
            $values[] = $range->start;
            $values[] = $range->end;
        }
    }

    // Add sentinel for binary search
    $propOffsets[] = count($values);

    // TODO ucgendat.c pads the prop offsets to the next multiple of 4
    // for rather dubious reasons of alignment. This should probably be
    // dropped
    while (count($propOffsets) % 4 != 0) {
        $propOffsets[] = 0;
    }

    $result = "";
    $result .= "static const unsigned short _ucprop_size = $data->numProps;\n\n";
    $result .= "static const unsigned short _ucprop_offsets[] = {";
    $result .= formatShortHexArray($propOffsets, 8);
    $result .= "\n};\n\n";

    $result .= "static const unsigned int _ucprop_ranges[] = {";
    $result .= formatIntArray($values, 4);
    $result .= "\n};\n\n";
    return $result;
}
/**
 * Flattens one level of nesting: returns the elements of every inner
 * array, in order, as a single list. Keys of both levels are discarded.
 *
 * @param array $array Array of arrays.
 * @return array List of all inner elements.
 */
function flatten(array $array) {
    $walk = static function (array $groups) {
        foreach ($groups as $group) {
            foreach ($group as $item) {
                yield $item;
            }
        }
    };
    // preserve_keys = false renumbers the yielded items from zero
    return iterator_to_array($walk($array), false);
}
/**
 * Rewrites $data->caseMaps in place into the form used for table output.
 *
 * 1. Title-case entries equal to the upper-case mapping of the same
 *    codepoint (or to the codepoint itself when it has no upper-case
 *    entry) are removed.
 * 2. Every mapping that is an array is appended to $data->extraCaseData
 *    and replaced by an integer: the array length minus one in bits 24 and
 *    up, OR-ed with the index of its first element in extraCaseData.
 */
function prepareCaseData(UnicodeData $data) {
    // Don't store titlecase if it's the same as uppercase
    foreach ($data->caseMaps['title'] as $codepoint => $titleMapping) {
        $upperMapping = $data->caseMaps['upper'][$codepoint] ?? $codepoint;
        if ($titleMapping != $upperMapping) {
            continue;
        }
        unset($data->caseMaps['title'][$codepoint]);
    }

    // Store full (multi-char) case mappings in a separate table and only
    // store an index into it
    foreach ($data->caseMaps as $mapName => $mappings) {
        foreach ($mappings as $codepoint => $mapping) {
            if (!is_array($mapping)) {
                continue;
            }

            $offset = count($data->extraCaseData);
            // -1 because the first entry is the simple case mapping
            $extraLength = count($mapping) - 1;
            $data->caseMaps[$mapName][$codepoint] = ($extraLength << 24) | $offset;

            foreach ($mapping as $entry) {
                $data->extraCaseData[] = $entry;
            }
        }
    }
}
/**
 * Generates the C source for one case-mapping perfect hash.
 *
 * Builds the hash with generateMPH() in non-fast mode, prints the sizes of
 * both tables to standard output, and returns the definitions of
 * `_uccase_<name>_g_size`, `_uccase_<name>_g`, `_uccase_<name>_table_size`
 * and `_uccase_<name>_table`.
 *
 * @param string $name Suffix used in the generated identifiers.
 * @param array  $map  Mapping to hash, passed to generateMPH().
 * @return string C source text.
 */
function generateCaseMPH(string $name, array $map) {
    $mph = generateMPH($map, false /* fast */);
    $gTable = $mph[0];
    $table = $mph[1];
    $gCount = count($gTable);
    $tableCount = count($table);

    echo "$name: n=" . $tableCount . ", g=" . $gCount . "\n";

    $prefix = "_uccase_" . $name;
    return "static const unsigned {$prefix}_g_size = " . $gCount . ";\n"
        . "static const short {$prefix}_g[] = {"
        . formatShortDecArray($gTable, 8)
        . "\n};\n\n"
        . "static const unsigned {$prefix}_table_size = " . $tableCount . ";\n"
        . "static const unsigned {$prefix}_table[] = {"
        // Each table entry is a [key, value] pair; emit them back to back
        . formatIntArray(flatten($table), 4)
        . "\n};\n\n";
}
/**
 * Generates the C source for all case-mapping tables.
 *
 * Runs prepareCaseData() on $data, then emits one perfect hash per case
 * map (upper, lower, title, fold, in that order) followed by the
 * `_uccase_extra_table` array built from $data->extraCaseData.
 *
 * @return string C source text.
 */
function generateCaseData(UnicodeData $data) {
    prepareCaseData($data);

    $sections = [];
    foreach (['upper', 'lower', 'title', 'fold'] as $mapName) {
        $sections[] = generateCaseMPH($mapName, $data->caseMaps[$mapName]);
    }

    $sections[] = "static const unsigned _uccase_extra_table[] = {"
        . formatIntArray($data->extraCaseData, 4)
        . "\n};\n\n";

    return implode('', $sections);
}
/**
 * Generates the complete text of the Unicode data header: a fixed banner
 * comment, then the property tables, then the case-mapping tables.
 *
 * @return string C source text.
 */
function generateData(UnicodeData $data) {
    // Nowdoc contents and closing marker stay at column 0 so the emitted
    // banner text is unchanged.
    $banner = <<<'HEADER'
/* This file was generated from a modified version of UCData's ucgendat.
*
* DO NOT EDIT THIS FILE!
*
* Instead, download the appropriate UnicodeData-x.x.x.txt and
* CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/
* and run ext/mbstring/ucgendat/ucgendat.php.
*
* More information can be found in the UCData package. Unfortunately,
* the project's page doesn't seem to be live anymore, so you can use
* OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */
HEADER;

    $propSection = generatePropData($data);
    $caseSection = generateCaseData($data);

    return $banner . "\n\n" . $propSection . $caseSection;
}
/*
* Minimal Perfect Hash Generation
*
* Based on "Hash, displace, and compress" algorithm due to
* Belazzougui, Botelho and Dietzfelbinger.
*
* Hash function based on https://stackoverflow.com/a/12996028/385378.
* MPH implementation based on http://stevehanov.ca/blog/index.php?id=119.
*/
/**
 * Integer hash used by the perfect hash generator.
 *
 * XORs $x with $d, folds the value shifted right by 16 bits back into
 * itself, multiplies by 0x45d9f3b and keeps the low 32 bits.
 *
 * @param int $d Value XOR-ed into $x before mixing.
 * @param int $x Value to hash.
 * @return int Result masked with 0xffffffff.
 */
function hashInt(int $d, int $x) {
    $mixed = $x ^ $d;
    $mixed ^= $mixed >> 16;
    return ($mixed * 0x45d9f3b) & 0xffffffff;
}
/**
 * Attempts to build a minimal perfect hash for $map with a displacement
 * table of $gSize entries.
 *
 * Keys are grouped into buckets by hashInt(0, key) % $gSize. For each
 * bucket holding more than one key (largest buckets first) a displacement
 * d >= 1 is searched such that hashInt(d, key) % count($map) sends all of
 * the bucket's keys to distinct, still unoccupied slots. Buckets holding
 * exactly one key are then assigned the remaining free slots directly.
 *
 * Displacement table entries are: 0x7fff where no bucket wrote a value,
 * the displacement d for multi-key buckets, and the negated slot index
 * for single-key buckets.
 *
 * @param array $map   Mapping with integer keys.
 * @param int   $gSize Number of entries in the displacement table.
 * @return array [] if some bucket needs a displacement above 0x7fff,
 *               otherwise [displacement table, slot table] where every
 *               slot holds a [key, value] pair.
 */
function tryGenerateMPH(array $map, int $gSize) {
    $tableSize = count($map);
    $table = [];
    $gTable = array_fill(0, $gSize, 0x7fff);

    $buckets = [];
    foreach ($map as $key => $value) {
        $buckets[hashInt(0, $key) % $gSize][] = [$key, $value];
    }

    // Sort by descending number of collisions
    usort($buckets, function ($left, $right) {
        return count($right) <=> count($left);
    });

    foreach ($buckets as $bucket) {
        if (count($bucket) <= 1) {
            continue;
        }

        // Try values of $d until all elements placed in different slots
        $d = 1;
        while (true) {
            if ($d > 0x7fff) {
                return [];
            }

            // slot => entry for this attempt; null once the attempt failed
            $claimed = [];
            foreach ($bucket as $entry) {
                $slot = hashInt($d, $entry[0]) % $tableSize;
                if (isset($table[$slot]) || isset($claimed[$slot])) {
                    $claimed = null;
                    break;
                }
                $claimed[$slot] = $entry;
            }

            if ($claimed !== null) {
                break;
            }
            $d++;
        }

        $gTable[hashInt(0, $bucket[0][0]) % $gSize] = $d;
        foreach ($claimed as $slot => $entry) {
            $table[$slot] = $entry;
        }
    }

    $freeSlots = [];
    for ($slot = 0; $slot < $tableSize; $slot++) {
        if (!isset($table[$slot])) {
            $freeSlots[] = $slot;
        }
    }

    // For buckets with only one element, we directly store the index
    $nextFree = 0;
    foreach ($buckets as $bucket) {
        if (count($bucket) != 1) {
            continue;
        }

        $entry = $bucket[0];
        $slot = $freeSlots[$nextFree];
        $nextFree++;

        $table[$slot] = $entry;
        $gTable[hashInt(0, $entry[0]) % $gSize] = -$slot;
    }

    ksort($gTable);
    ksort($table);
    return [$gTable, $table];
}
/**
 * Builds a minimal perfect hash for $map by calling tryGenerateMPH() with
 * growing displacement table sizes until one attempt succeeds.
 *
 * In fast mode the size is count($map) / lambda for lambda = 5.0, 4.5, ...,
 * 0.5; if none of those succeed, the search continues one size at a time.
 * In non-fast mode every size starting at count($map) / 7.0 is tried.
 *
 * A non-empty map is never tried with a table size of zero, because
 * tryGenerateMPH() reduces hashes modulo that size.
 *
 * @param array $map  Mapping to hash.
 * @param bool  $fast Whether to use the coarse lambda-stepped search.
 * @return array The non-empty result of the first successful
 *               tryGenerateMPH() call.
 */
function generateMPH(array $map, bool $fast) {
    $count = count($map);
    // Small maps would otherwise round down to zero buckets. The empty map
    // keeps size 0, for which tryGenerateMPH() performs no modulo at all.
    $minSize = $count > 0 ? 1 : 0;

    if ($fast) {
        // Check size starting lambda=5.0 in 0.5 increments. The loop stops
        // before lambda reaches zero so the division stays defined.
        for ($lambda = 5.0; $lambda > 0.0; $lambda -= 0.5) {
            $m = max($minSize, (int) ($count / $lambda));
            $mph = tryGenerateMPH($map, $m);
            if (!empty($mph)) {
                return $mph;
            }
        }
        // Every coarse step failed: keep going one size at a time
        $m++;
    } else {
        // Check all sizes starting lambda=7.0
        $m = max($minSize, (int) ($count / 7.0));
    }

    for (;; $m++) {
        $mph = tryGenerateMPH($map, $m);
        if (!empty($mph)) {
            return $mph;
        }
    }
}
/**
 * Generates the C source for the East Asian Width table.
 *
 * The output is a fixed banner comment and struct declaration, one
 * `{ 0xSTART, 0xEND },` row per range in lower-case hex, and the closing
 * `};`.
 *
 * @param array $wideRanges Objects exposing `start` and `end` codepoints.
 * @return string C source text.
 */
function generateEastAsianWidthData(array $wideRanges) {
    // Nowdoc contents and closing marker stay at column 0 so the emitted
    // banner text is unchanged.
    $header = <<<'HEADER'
/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
*
* DO NOT EDIT THIS FILE!
*
* East Asian Width table
*
* Some characters in East Asian languages are intended to be displayed in a space
* which is roughly square. (This contrasts with others such as the Latin alphabet,
* which are taller than they are wide.) To display these East Asian characters
* properly, twice the horizontal space is used. This must be taken into account
* when doing things like wrapping text to a specific width.
*
* Each pair of numbers in the below table is a range of Unicode codepoints
* which should be displayed as double-width.
*/
static const struct {
int begin;
int end;
} mbfl_eaw_table[] = {
HEADER;

    $rows = array_map(static function ($range) {
        $startCode = dechex($range->start);
        $endCode = dechex($range->end);
        return "\t{ 0x{$startCode}, 0x{$endCode} },\n";
    }, $wideRanges);

    return $header . implode('', $rows) . "};\n";
}
Function Calls
None |
Stats
MD5 | 80a92297ba0564647e75a689e830d179 |
Eval Count | 0 |
Decode Time | 96 ms |