Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 59 additions & 50 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -6089,61 +6089,73 @@ _PyUnicode_EncodeUTF32(PyObject *str,
const char *errors,
int byteorder)
{
int kind;
const void *data;
Py_ssize_t len;
PyObject *v;
uint32_t *out;
if (!PyUnicode_Check(str)) {
PyErr_BadArgument();
return NULL;
}
int kind = PyUnicode_KIND(str);
const void *data = PyUnicode_DATA(str);
Py_ssize_t len = PyUnicode_GET_LENGTH(str);

if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
return PyErr_NoMemory();
Py_ssize_t nsize = len + (byteorder == 0);

#if PY_LITTLE_ENDIAN
int native_ordering = byteorder <= 0;
#else
int native_ordering = byteorder >= 0;
#endif
const char *encoding;
Py_ssize_t nsize, pos;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
PyObject *rep = NULL;

if (!PyUnicode_Check(str)) {
PyErr_BadArgument();
return NULL;
if (kind == PyUnicode_1BYTE_KIND) {
// gh-139156: Don't use PyBytesWriter API here since it has an overhead
// on short strings
PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
if (v == NULL) {
return NULL;
}

/* output buffer is 4-bytes aligned */
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v);
if (byteorder == 0) {
*out++ = 0xFEFF;
}
if (len > 0) {
ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
&out, native_ordering);
}
return v;
}
kind = PyUnicode_KIND(str);
data = PyUnicode_DATA(str);
len = PyUnicode_GET_LENGTH(str);

if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
return PyErr_NoMemory();
nsize = len + (byteorder == 0);
v = PyBytes_FromStringAndSize(NULL, nsize * 4);
if (v == NULL)
PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
if (writer == NULL) {
return NULL;
}

/* output buffer is 4-bytes aligned */
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
out = (uint32_t *)PyBytes_AS_STRING(v);
if (byteorder == 0)
assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
if (byteorder == 0) {
*out++ = 0xFEFF;
if (len == 0)
goto done;
}
if (len == 0) {
return PyBytesWriter_Finish(writer);
}

const char *encoding;
if (byteorder == -1)
encoding = "utf-32-le";
else if (byteorder == 1)
encoding = "utf-32-be";
else
encoding = "utf-32";

if (kind == PyUnicode_1BYTE_KIND) {
ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
goto done;
}

pos = 0;
while (pos < len) {
Py_ssize_t newpos, repsize, moreunits;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
PyObject *rep = NULL;

for (Py_ssize_t pos = 0; pos < len; ) {
if (kind == PyUnicode_2BYTE_KIND) {
pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
&out, native_ordering);
Expand All @@ -6156,13 +6168,15 @@ _PyUnicode_EncodeUTF32(PyObject *str,
if (pos == len)
break;

Py_ssize_t newpos;
rep = unicode_encode_call_errorhandler(
errors, &errorHandler,
encoding, "surrogates not allowed",
str, &exc, pos, pos + 1, &newpos);
if (!rep)
goto error;

Py_ssize_t repsize, moreunits;
if (PyBytes_Check(rep)) {
repsize = PyBytes_GET_SIZE(rep);
if (repsize & 3) {
Expand All @@ -6188,21 +6202,18 @@ _PyUnicode_EncodeUTF32(PyObject *str,

/* four bytes are reserved for each surrogate */
if (moreunits > 0) {
Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
/* integer overflow */
PyErr_NoMemory();
out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
if (out == NULL) {
goto error;
}
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
goto error;
out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
}

if (PyBytes_Check(rep)) {
memcpy(out, PyBytes_AS_STRING(rep), repsize);
out += repsize / 4;
} else /* rep is unicode */ {
}
else {
/* rep is unicode */
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
&out, native_ordering);
Expand All @@ -6211,21 +6222,19 @@ _PyUnicode_EncodeUTF32(PyObject *str,
Py_CLEAR(rep);
}

Py_XDECREF(errorHandler);
Py_XDECREF(exc);

/* Cut back to size actually needed. This is necessary for, for example,
encoding of a string containing isolated surrogates and the 'ignore'
handler is used. */
nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
if (nsize != PyBytes_GET_SIZE(v))
_PyBytes_Resize(&v, nsize);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
done:
return v;
return PyBytesWriter_FinishWithPointer(writer, out);

error:
Py_XDECREF(rep);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
Py_XDECREF(v);
PyBytesWriter_Discard(writer);
return NULL;
}

Expand Down
Loading