Skip to content

Commit

Permalink
added support for decoding Java's weird modified UTF-8 encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
KurtThiemann committed Nov 29, 2022
1 parent 859f4a6 commit 4639486
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 1 deletion.
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,18 @@ $myInt->setValue(42);
echo $myInt->getValue(); // 42
```

On String tags, `getValue()` and `setValue()` will use the raw string data, which uses Java's modified
UTF-8 encoding. To use different encodings,
use `getDecodedValue($encoding = "UTF-8")` and `setDecodedValue($value, $encoding = "UTF-8")` instead.
A list of supported encodings is returned by the `mb_list_encodings()` function.

```php
$myString = new \Aternos\Nbt\Tag\StringTag();

$myString->setDecodedValue("Hello world!");
echo $myString->getDecodedValue(); // Hello world!
```

Compound tags, list tags, and array tags implement the `ArrayAccess`, `Countable`,
and `Iterator` interfaces and can therefore be accessed as arrays.
```php
Expand Down
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"php": ">=8.0",
"php-64bit": "*",
"ext-zlib": "*",
"ext-json": "*"
"ext-json": "*",
"ext-mbstring": "*"
}
}
139 changes: 139 additions & 0 deletions src/String/JavaEncoding.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
<?php

namespace Aternos\Nbt\String;

/**
 * Converts strings between Java's "modified UTF-8" and any encoding supported by mbstring.
 *
 * Modified UTF-8 differs from standard UTF-8 in two ways:
 *  - U+0000 is written as the two byte sequence C0 80, so encoded data never contains a raw NUL byte.
 *  - Code points above U+FFFF are written as a UTF-16 surrogate pair, where each surrogate is
 *    encoded as its own three byte sequence (six bytes in total).
 *
 * https://py2jdbc.readthedocs.io/en/latest/mutf8.html
 * https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8
 * https://docs.oracle.com/javase/6/docs/api/java/io/DataInput.html#readUTF%28%29
 */
class JavaEncoding
{
    protected static ?JavaEncoding $instance = null;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return static
     */
    public static function getInstance(): static
    {
        if (static::$instance === null) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Encodes a string as Java modified UTF-8.
     *
     * @param string $string text to encode
     * @param string $sourceEncoding mbstring encoding name of $string
     * @return string modified UTF-8 byte string
     */
    public function encode(string $string, string $sourceEncoding = "UTF-8"): string
    {
        $result = "";

        foreach (mb_str_split($string, 1, $sourceEncoding) as $char) {
            $c = mb_ord($char, $sourceEncoding);

            // NUL is never written as a raw zero byte
            if ($c === 0) {
                $result .= "\xC0\x80";
                continue;
            }

            // Single byte character
            if ($c <= 0x7F) {
                $result .= chr($c);
                continue;
            }

            // Two byte character
            if ($c <= 0x7FF) {
                $result .= chr(0xC0 | (0x1F & ($c >> 0x06)));
                $result .= chr(0x80 | (0x3F & $c));
                continue;
            }

            // Three byte character
            if ($c <= 0xFFFF) {
                $result .= chr(0xE0 | (0x0F & ($c >> 0x0C)));
                $result .= chr(0x80 | (0x3F & ($c >> 0x06)));
                $result .= chr(0x80 | (0x3F & $c));
                continue;
            }

            // Supplementary character: the surrogate pair carries the 20 bit offset from
            // U+10000, not the code point itself, so the offset has to be removed before
            // the bits are distributed over the two three byte sequences.
            $c -= 0x10000;
            $result .= chr(0xED);
            $result .= chr(0xA0 | (($c >> 0x10) & 0x0F));
            $result .= chr(0x80 | (($c >> 0x0A) & 0x3F));
            $result .= chr(0xED);
            $result .= chr(0xB0 | (($c >> 0x06) & 0x0F));
            $result .= chr(0x80 | ($c & 0x3F));
        }

        return $result;
    }

    /**
     * Decodes a Java modified UTF-8 byte string.
     *
     * @param string $string modified UTF-8 byte string
     * @param string $outputEncoding mbstring encoding name of the returned string
     * @return string decoded text in $outputEncoding
     * @throws StringDataFormatException if the input contains a raw NUL byte or a malformed sequence
     */
    public function decode(string $string, string $outputEncoding = "UTF-8"): string
    {
        $result = "";
        $length = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            $a = ord($string[$i]);

            if ($a === 0) {
                throw new StringDataFormatException("Invalid NULL byte in string");
            }

            // Single byte character
            if (($a & 0b10000000) === 0b0) {
                $result .= mb_chr($a, $outputEncoding);
                continue;
            }

            // A missing byte is read as 0, which is never a valid continuation byte,
            // so truncated sequences are rejected by the checks below.
            $b = ord($string[++$i] ?? "\0");

            // Two byte character
            if (($a & 0b11100000) === 0b11000000) {
                if (($b & 0b11000000) !== 0b10000000) {
                    throw new StringDataFormatException("Invalid \"UTF-8\" sequence");
                }

                $result .= mb_chr((($a & 0x1F) << 6) | ($b & 0x3F), $outputEncoding);
                continue;
            }

            $c = ord($string[++$i] ?? "\0");

            // Maybe six byte character (encoded high surrogate)
            if ($a === 0b11101101 && ($b & 0b11110000) === 0b10100000 && ($c & 0b11000000) === 0b10000000) {
                // Peek ahead without consuming, in case no low surrogate follows
                $d = ord($string[$i + 1] ?? "\0");
                $e = ord($string[$i + 2] ?? "\0");
                $f = ord($string[$i + 3] ?? "\0");

                // Six byte character (high surrogate followed by low surrogate)
                if ($d === 0b11101101 && ($e & 0b11110000) === 0b10110000 && ($f & 0b11000000) === 0b10000000) {
                    // The pair stores a 20 bit offset that must be ADDED to 0x10000.
                    // A bitwise OR would lose the 0x10000 bit whenever the top nibble is odd.
                    $offset = (($b & 0x0F) << 0x10)
                        | (($c & 0x3F) << 0x0A)
                        | (($e & 0x0F) << 0x06)
                        | ($f & 0x3F);
                    $result .= mb_chr(0x10000 + $offset, $outputEncoding);

                    $i += 3;
                    continue;
                }
            }

            // Three byte character
            // NOTE(review): an unpaired surrogate also ends up here; mb_chr() is expected to
            // return false for it, in which case nothing is appended - confirm this is intended.
            if (($a & 0b11110000) === 0b11100000) {
                if (($b & 0b11000000) !== 0b10000000 || ($c & 0b11000000) !== 0b10000000) {
                    throw new StringDataFormatException("Invalid \"UTF-8\" sequence");
                }

                $result .= mb_chr((($a & 0x0F) << 12) | (($b & 0x3F) << 6) | ($c & 0x3F), $outputEncoding);
                continue;
            }

            throw new StringDataFormatException("Invalid \"UTF-8\" sequence");
        }

        return $result;
    }
}
8 changes: 8 additions & 0 deletions src/String/StringDataFormatException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?php

namespace Aternos\Nbt\String;

/**
 * Signals that string data is not valid Java modified UTF-8,
 * e.g. because it contains a raw NUL byte or a malformed byte sequence.
 */
class StringDataFormatException extends \Exception
{
}
23 changes: 23 additions & 0 deletions src/Tag/StringTag.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

use Aternos\Nbt\IO\Reader\Reader;
use Aternos\Nbt\IO\Writer\Writer;
use Aternos\Nbt\String\JavaEncoding;
use Aternos\Nbt\String\StringDataFormatException;
use Exception;

class StringTag extends Tag
Expand All @@ -20,6 +22,16 @@ public function getValue(): string
return $this->value;
}

/**
 * Returns the tag's value converted from Java's modified UTF-8 into the given encoding.
 *
 * @param string $encoding target encoding name, passed through to the decoder
 * @return string
 * @throws StringDataFormatException if the stored value cannot be decoded
 */
public function getDecodedValue(string $encoding = "UTF-8"): string
{
    $converter = JavaEncoding::getInstance();

    return $converter->decode($this->value, $encoding);
}

/**
* @param string $value
* @return StringTag
Expand All @@ -30,6 +42,17 @@ public function setValue(string $value): StringTag
return $this;
}

/**
 * Stores the given text after converting it from the given encoding to Java's modified UTF-8.
 *
 * @param string $value text in $encoding
 * @param string $encoding source encoding name, passed through to the encoder
 * @return StringTag this tag, to allow call chaining
 */
public function setDecodedValue(string $value, string $encoding = "UTF-8"): StringTag
{
    $converter = JavaEncoding::getInstance();
    $this->value = $converter->encode($value, $encoding);

    return $this;
}

/**
* @return int
*/
Expand Down

0 comments on commit 4639486

Please sign in to comment.