|
| 1 | +.. highlight:: python |
| 2 | + :linenothreshold: 10 |
| 3 | + |
| 4 | +.. toctree:: |
| 5 | + :maxdepth: 2 |
| 6 | + |
| 7 | +==================================== |
| 8 | +Python Unicode Strings and C++ |
| 9 | +==================================== |
| 10 | + |
| 11 | +Yes, Unicode is a pain but it is here to stay, particularly with Python 3. This section looks at how you can bridge between Python and C++ Unicode in Python extensions. This section is only about Python 3+ and C++11 or later. |
| 12 | + |
| 13 | +Whilst Python is Unicode aware, C++ is not. C++11 did add ``std::basic_string`` specialisations for 2 and 4 byte 'Unicode' characters, but these are just containers; they have no real awareness of what they contain. |
| 14 | + |
| 15 | +------------------------------------ |
| 16 | +Basic Handling of Unicode |
| 17 | +------------------------------------ |
| 18 | + |
| 19 | +The task here is to: |
| 20 | + |
| 21 | +#. Take any Python Unicode string as an argument. |
| 22 | +#. Convert it into an appropriate C++ container. |
| 23 | +#. Dump that C++ container out to ``std::cout``. |
| 24 | +#. Create a new Python Unicode string from that C++ container and return it. |
| 25 | + |
| 26 | +This is just to show that we can round-trip between the internal representations of the two languages. |
| 27 | + |
| 28 | +Here is the despatch function that takes a single Unicode argument (note the ``"U"`` specification) and calls the appropriate handling function: |
| 29 | + |
| 30 | +.. code-block:: cpp |
| 31 | +
|
| 32 | + /* Handler functions, defined later. */ |
| 33 | + PyObject *unicode_1_to_string_and_back(PyObject *py_str); |
| 34 | + PyObject *unicode_2_to_string_and_back(PyObject *py_str); |
| 35 | + PyObject *unicode_4_to_string_and_back(PyObject *py_str); |
| 36 | +
|
| 37 | + PyObject* |
| 38 | + unicode_to_string_and_back(PyObject * /* module */, PyObject *args) { |
| 39 | + PyObject *py_str = NULL; |
| 40 | + PyObject *ret_val = NULL; |
| 41 | + if (PyArg_ParseTuple(args, "U", &py_str)) { |
| 42 | + switch (PyUnicode_KIND(py_str)) { |
| 43 | + case PyUnicode_1BYTE_KIND: |
| 44 | + ret_val = unicode_1_to_string_and_back(py_str); |
| 45 | + break; |
| 46 | + case PyUnicode_2BYTE_KIND: |
| 47 | + ret_val = unicode_2_to_string_and_back(py_str); |
| 48 | + break; |
| 49 | + case PyUnicode_4BYTE_KIND: |
| 50 | + ret_val = unicode_4_to_string_and_back(py_str); |
| 51 | + break; |
| 52 | + default: |
| 53 | + PyErr_Format(PyExc_ValueError, |
| 54 | + "In %s argument is not recognised as a Unicode 1, 2, 4 byte string", |
| 55 | + __FUNCTION__); |
| 56 | + break; |
| 57 | + } |
| 58 | + } |
| 59 | + return ret_val; |
| 60 | + } |
| 61 | +
|
| 62 | +The three handler functions are here, they use ``std::string``, ``std::u16string`` and ``std::u32string`` as appropriate: |
| 63 | + |
| 64 | +.. code-block:: cpp |
| 65 | +
|
| 66 | + PyObject* |
| 67 | + unicode_1_to_string_and_back(PyObject *py_str) { |
| 68 | + assert(PyUnicode_KIND(py_str) == PyUnicode_1BYTE_KIND); |
| 69 | + std::string result = std::string((char*)PyUnicode_1BYTE_DATA(py_str)); |
| 70 | + dump_string(result); |
| 71 | + return PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, |
| 72 | + result.c_str(), |
| 73 | + result.size()); |
| 74 | + } |
| 75 | +
|
| 76 | + PyObject* |
| 77 | + unicode_2_to_string_and_back(PyObject *py_str) { |
| 78 | + assert(PyUnicode_KIND(py_str) == PyUnicode_2BYTE_KIND); |
| 79 | + // std::u16string is a std::basic_string<char16_t> |
| 80 | + std::u16string result = std::u16string((char16_t*)PyUnicode_2BYTE_DATA(py_str)); |
| 81 | + dump_string(result); |
| 82 | + return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, |
| 83 | + result.c_str(), |
| 84 | + result.size()); |
| 85 | + } |
| 86 | +
|
| 87 | + PyObject* |
| 88 | + unicode_4_to_string_and_back(PyObject *py_str) { |
| 89 | + assert(PyUnicode_KIND(py_str) == PyUnicode_4BYTE_KIND); |
| 90 | + // std::u32string is a std::basic_string<char32_t> |
| 91 | + std::u32string result = std::u32string((char32_t*)PyUnicode_4BYTE_DATA(py_str)); |
| 92 | + dump_string(result); |
| 93 | + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, |
| 94 | + result.c_str(), |
| 95 | + result.size()); |
| 96 | + } |
| 97 | +
|
| 98 | +Each of these calls ``dump_string`` which is a template function: |
| 99 | + |
| 100 | +.. code-block:: cpp |
| 101 | +
|
/* Writes a diagnostic dump of str to std::cout.
 *
 * First a header line with the number of code units and sizeof(T), then one
 * line per code unit: its value in zero-padded hex (2 * sizeof(T) digits), its
 * value in decimal (zero-padded to 8 columns) and the unit itself streamed in
 * double quotes.
 */
template <typename T>
void dump_string(const std::basic_string<T> &str) {
    // Mask that keeps only the low 8 * sizeof(T) bits. Plain char may be a
    // signed type, in which case converting a unit >= 0x80 straight to an
    // integer sign-extends it (0xac would print as ffffffac / a negative
    // number). Widening and masking yields the unsigned code unit for every T.
    const unsigned long long unit_mask =
        ~0ULL >> (8 * (sizeof(unsigned long long) - sizeof(T)));

    std::cout << "String size: " << str.size();
    std::cout << " word size: " << sizeof(T) << std::endl;
    for (const T unit : str) {
        const unsigned long long code =
            static_cast<unsigned long long>(unit) & unit_mask;
        std::cout << std::setfill('0');
        std::cout << "0x" << std::hex;
        std::cout << std::setw(2 * sizeof(T)) << code;
        std::cout << " " << std::dec << std::setw(8) << code;
        std::cout << std::setfill(' ');
        // For char this streams the character itself; for char16_t/char32_t
        // there is no character inserter, so the numeric value is printed.
        std::cout << " \"" << unit << "\"" << std::endl;
    }
}
| 115 | +
|
| 116 | +For completeness here is the module code that creates a ``cUnicode`` module with a single ``show()`` function: |
| 117 | + |
| 118 | +.. code-block:: c |
| 119 | +
|
| 120 | + static PyMethodDef cUnicode_Methods[] = { |
| 121 | + {"show", (PyCFunction)unicode_to_string_and_back, METH_VARARGS, |
| 122 | + "Convert a Python unicode string to std::string and back."}, |
| 123 | + {NULL, NULL, 0, NULL} /* Sentinel */ |
| 124 | + }; |
| 125 | +
|
| 126 | + static PyModuleDef cUnicodemodule = { |
| 127 | + PyModuleDef_HEAD_INIT, |
| 128 | + "cUnicode", |
| 129 | + "cUnicode works with unicode strings.", |
| 130 | + -1, |
| 131 | + cUnicode_Methods, |
| 132 | + NULL, NULL, NULL, NULL |
| 133 | + }; |
| 134 | +
|
| 135 | + PyMODINIT_FUNC |
| 136 | + PyInit_cUnicode(void) |
| 137 | + { |
| 138 | + PyObject* m; |
| 139 | +
|
| 140 | + m = PyModule_Create(&cUnicodemodule); |
| 141 | + if (m == NULL) |
| 142 | + return NULL; |
| 143 | + return m; |
| 144 | + } |
| 145 | +
|
| 146 | +Here is an example of using this module: |
| 147 | + |
| 148 | +.. code-block:: py |
| 149 | +
|
| 150 | + >>> import cUnicode |
| 151 | + >>> cUnicode.show('Hello') |
| 152 | + String size: 5 word size: 1 |
| 153 | + 0x00000048 72 "H" |
| 154 | + 0x00000065 101 "e" |
| 155 | + 0x0000006c 108 "l" |
| 156 | + 0x0000006c 108 "l" |
| 157 | + 0x0000006f 111 "o" |
| 158 | + 'Hello' |
| 159 | + >>> s = "a\xac\u1234\u20ac\U00008000" |
| 160 | + >>> r = cUnicode.show(s) |
| 161 | + String size: 5 word size: 2 |
| 162 | + 0x00000061 97 "97" |
| 163 | + 0x000000ac 172 "172" |
| 164 | + 0x00001234 4660 "4660" |
| 165 | + 0x000020ac 8364 "8364" |
| 166 | + 0x00008000 32768 "32768" |
| 167 | + >>> r == s |
| 168 | + True |
| 169 | + >>> s = "a\xac\u1234\u20ac\U00018000" |
| 170 | + >>> r = cUnicode.show(s) |
| 171 | + String size: 5 word size: 4 |
| 172 | + 0x00000061 97 "97" |
| 173 | + 0x000000ac 172 "172" |
| 174 | + 0x00001234 4660 "4660" |
| 175 | + 0x000020ac 8364 "8364" |
| 176 | + 0x00018000 98304 "98304" |
| 177 | + >>> r == s |
| 178 | + True |
| 179 | +
|
| 180 | +----------------------------------------------------------------------- |
| 181 | +Working with ``bytes``, ``bytearray`` and UTF-8 Unicode Arguments |
| 182 | +----------------------------------------------------------------------- |
| 183 | + |
| 184 | +It is fairly common to want to convert an argument that is ``bytes``, ``bytearray`` or UTF-8 to a ``std::string``. This function will do just that: |
| 185 | + |
| 186 | +.. code-block:: cpp |
| 187 | +
|
| 188 | + /* Convert a PyObject to a std::string and return 0 if succesful. |
| 189 | + * If py_str is Unicode than treat it as UTF-8. |
| 190 | + * This works with Python 2.7 and Python 3.4 onwards. |
| 191 | + */ |
| 192 | + int py_string_to_std_string(const PyObject *py_str, |
| 193 | + std::string &result, |
| 194 | + bool utf8_only=true) { |
| 195 | + result.clear(); |
| 196 | + if (PyBytes_Check(py_str)) { |
| 197 | + result = std::string(PyBytes_AS_STRING(py_str)); |
| 198 | + return 0; |
| 199 | + } |
| 200 | + if (PyByteArray_Check(py_str)) { |
| 201 | + result = std::string(PyByteArray_AS_STRING(py_str)); |
| 202 | + return 0; |
| 203 | + } |
| 204 | + // Must be unicode then. |
| 205 | + if (! PyUnicode_Check(py_str)) { |
| 206 | + PyErr_Format(PyExc_ValueError, |
| 207 | + "In %s \"py_str\" failed PyUnicode_Check()", |
| 208 | + __FUNCTION__); |
| 209 | + return -1; |
| 210 | + } |
| 211 | + if (PyUnicode_READY(py_str)) { |
| 212 | + PyErr_Format(PyExc_ValueError, |
| 213 | + "In %s \"py_str\" failed PyUnicode_READY()", |
| 214 | + __FUNCTION__); |
| 215 | + return -2; |
| 216 | + } |
| 217 | + if (utf8_only && PyUnicode_KIND(py_str) != PyUnicode_1BYTE_KIND) { |
| 218 | + PyErr_Format(PyExc_ValueError, |
| 219 | + "In %s \"py_str\" not utf-8", |
| 220 | + __FUNCTION__); |
| 221 | + return -3; |
| 222 | + } |
| 223 | + // Python 3 and its minor versions (they vary) |
| 224 | + // const Py_UCS1 *pChrs = PyUnicode_1BYTE_DATA(pyStr); |
| 225 | + // result = std::string(reinterpret_cast<const char*>(pChrs)); |
| 226 | + #if PY_MAJOR_VERSION >= 3 |
| 227 | + result = std::string((char*)PyUnicode_1BYTE_DATA(py_str)); |
| 228 | + #else |
| 229 | + // Nasty cast away constness because PyString_AsString takes non-const in Py2 |
| 230 | + result = std::string((char*)PyString_AsString(const_cast<PyObject *>(py_str))); |
| 231 | + #endif |
| 232 | + return 0; |
| 233 | + } |
| 234 | +
|
| 235 | +And these three do the reverse: |
| 236 | + |
| 237 | +.. code-block:: cpp |
| 238 | +
|
| 239 | + PyObject* |
| 240 | + std_string_to_py_bytes(const std::string &str) { |
| 241 | + return PyBytes_FromStringAndSize(str.c_str(), str.size()); |
| 242 | + } |
| 243 | +
|
| 244 | + PyObject* |
| 245 | + std_string_to_py_bytearray(const std::string &str) { |
| 246 | + return PyByteArray_FromStringAndSize(str.c_str(), str.size()); |
| 247 | + } |
| 248 | +
|
| 249 | + PyObject* |
| 250 | + std_string_to_py_utf8(const std::string &str) { |
| 251 | + // Equivelent to: |
| 252 | + // PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, str.c_str(), str.size()); |
| 253 | + return PyUnicode_FromStringAndSize(str.c_str(), str.size()); |
| 254 | + } |
| 255 | +
|
| 256 | +
|
0 commit comments