Skip to content

Commit af1d027

Browse files
committed
Adds some advice on unicode handling.
1 parent c76212d commit af1d027

File tree

2 files changed

+257
-0
lines changed

2 files changed

+257
-0
lines changed

doc/sphinx/source/index.rst

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ This describes reliable patterns of coding Python Extensions in C. It covers the
1818
new_types
1919
module_globals
2020
super_call
21+
unicode_and_cpp
2122
compiler_flags
2223
debugging/debug
2324
thread_safety

doc/sphinx/source/unicode_and_cpp.rst

+256
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
.. highlight:: python
2+
:linenothreshold: 10
3+
4+
.. toctree::
5+
:maxdepth: 2
6+
7+
====================================
8+
Python Unicode Strings and C++
9+
====================================
10+
11+
Yes, Unicode is a pain but it is here to stay, particularly with Python 3. This section looks at how you can bridge between Python and C++ unicode in Python extensions. This section is only about Python 3+ and C++11 or later.
12+
13+
Whilst Python is Unicode aware, C++ is not. C++11 did add ``std::basic_string`` specialisations for 2 and 4 byte 'Unicode' characters, but these are just containers; they have no real awareness of what they contain.
14+
15+
------------------------------------
16+
Basic Handling of Unicode
17+
------------------------------------
18+
19+
The task here is to:
20+
21+
#. Take any Python Unicode string as an argument.
22+
#. Convert it into an appropriate C++ container.
23+
#. Dump that C++ container out to ``std::cout``.
24+
#. Create a new Python Unicode string from that C++ container and return it.
25+
26+
This is just to show that we can round-trip between the internal representations of the two languages.
27+
28+
Here is the despatch function that takes a single Unicode argument (note the ``"U"`` specification) and calls the appropriate handling function:
29+
30+
.. code-block:: cpp
31+
32+
/* Handler functions, defined later. */
33+
PyObject *unicode_1_to_string_and_back(PyObject *py_str);
34+
PyObject *unicode_2_to_string_and_back(PyObject *py_str);
35+
PyObject *unicode_4_to_string_and_back(PyObject *py_str);
36+
37+
PyObject*
38+
unicode_to_string_and_back(PyObject * /* module */, PyObject *args) {
39+
PyObject *py_str = NULL;
40+
PyObject *ret_val = NULL;
41+
if (PyArg_ParseTuple(args, "U", &py_str)) {
42+
switch (PyUnicode_KIND(py_str)) {
43+
case PyUnicode_1BYTE_KIND:
44+
ret_val = unicode_1_to_string_and_back(py_str);
45+
break;
46+
case PyUnicode_2BYTE_KIND:
47+
ret_val = unicode_2_to_string_and_back(py_str);
48+
break;
49+
case PyUnicode_4BYTE_KIND:
50+
ret_val = unicode_4_to_string_and_back(py_str);
51+
break;
52+
default:
53+
PyErr_Format(PyExc_ValueError,
54+
"In %s argument is not recognised as a Unicode 1, 2, 4 byte string",
55+
__FUNCTION__);
56+
break;
57+
}
58+
}
59+
return ret_val;
60+
}
61+
62+
The three handler functions are here, they use ``std::string``, ``std::u16string`` and ``std::u32string`` as appropriate:
63+
64+
.. code-block:: c
65+
66+
/* Copies a 1-byte-kind Python string into a std::string, dumps it with
 * dump_string() and builds a new Python string from the copy.
 *
 * Returns a new reference, or NULL with an exception set if
 * PyUnicode_FromKindAndData() fails.
 */
PyObject*
unicode_1_to_string_and_back(PyObject *py_str) {
    assert(PyUnicode_KIND(py_str) == PyUnicode_1BYTE_KIND);
    // Build from pointer + length rather than from a NUL-terminated pointer:
    // a Python string may legally contain '\0', which would otherwise
    // silently truncate the copy.
    const std::string result(
        reinterpret_cast<const char*>(PyUnicode_1BYTE_DATA(py_str)),
        static_cast<std::size_t>(PyUnicode_GET_LENGTH(py_str)));
    dump_string(result);
    return PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND,
                                     result.c_str(),
                                     static_cast<Py_ssize_t>(result.size()));
}
75+
76+
/* Copies a 2-byte-kind Python string into a std::u16string
 * (std::basic_string<char16_t>), dumps it with dump_string() and builds a
 * new Python string from the copy.
 *
 * Returns a new reference, or NULL with an exception set if
 * PyUnicode_FromKindAndData() fails.
 */
PyObject*
unicode_2_to_string_and_back(PyObject *py_str) {
    assert(PyUnicode_KIND(py_str) == PyUnicode_2BYTE_KIND);
    // Build from pointer + length rather than from a zero-terminated pointer:
    // a Python string may legally contain U+0000, which would otherwise
    // silently truncate the copy.
    const std::u16string result(
        reinterpret_cast<const char16_t*>(PyUnicode_2BYTE_DATA(py_str)),
        static_cast<std::size_t>(PyUnicode_GET_LENGTH(py_str)));
    dump_string(result);
    return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
                                     result.c_str(),
                                     static_cast<Py_ssize_t>(result.size()));
}
86+
87+
/* Copies a 4-byte-kind Python string into a std::u32string
 * (std::basic_string<char32_t>), dumps it with dump_string() and builds a
 * new Python string from the copy.
 *
 * Returns a new reference, or NULL with an exception set if
 * PyUnicode_FromKindAndData() fails.
 */
PyObject*
unicode_4_to_string_and_back(PyObject *py_str) {
    assert(PyUnicode_KIND(py_str) == PyUnicode_4BYTE_KIND);
    // Build from pointer + length rather than from a zero-terminated pointer:
    // a Python string may legally contain U+0000, which would otherwise
    // silently truncate the copy.
    const std::u32string result(
        reinterpret_cast<const char32_t*>(PyUnicode_4BYTE_DATA(py_str)),
        static_cast<std::size_t>(PyUnicode_GET_LENGTH(py_str)));
    dump_string(result);
    return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                     result.c_str(),
                                     static_cast<Py_ssize_t>(result.size()));
}
97+
98+
Each of these calls ``dump_string`` which is a template function:
99+
100+
.. code-block:: cpp
101+
102+
/* Writes a diagnostic dump of str to std::cout.
 *
 * The first line reports the number of code units and sizeof(T). Each
 * following line shows one code unit as zero-padded hex (two digits per
 * byte of T), as zero-padded decimal (width 8), and in quotes: the raw
 * character when T is one byte wide, otherwise its numeric value.
 */
template <typename T>
void dump_string(const std::basic_string<T> &str) {
    static_assert(sizeof(T) < sizeof(unsigned long long),
                  "dump_string expects a character type narrower than unsigned long long");
    // Mask selecting exactly the bits of one code unit.
    const unsigned long long unit_mask = (1ULL << (8 * sizeof(T))) - 1ULL;

    std::cout << "String size: " << str.size();
    std::cout << " word size: " << sizeof(T) << std::endl;
    for (const T unit : str) {
        // Mask after widening so a signed char above 0x7F (e.g. 0xAC) is
        // reported as 0xac / 172 instead of sign-extending to ffffffac / -84.
        const unsigned long long code_unit =
            static_cast<unsigned long long>(unit) & unit_mask;
        std::cout << std::setfill('0');
        std::cout << "0x" << std::hex;
        std::cout << std::setw(static_cast<int>(2 * sizeof(T))) << code_unit;
        std::cout << " " << std::dec << std::setw(8) << code_unit;
        std::cout << std::setfill(' ');
        std::cout << " \"";
        if (sizeof(T) == sizeof(char)) {
            std::cout << static_cast<char>(unit);
        } else {
            // Streaming char16_t/char32_t to a narrow stream prints a number
            // up to C++17 and is a deleted overload from C++20, so print the
            // numeric value explicitly.
            std::cout << code_unit;
        }
        std::cout << "\"" << std::endl;
    }
}
115+
116+
For completeness here is the module code that creates a ``cUnicode`` module with a single ``show()`` function:
117+
118+
.. code-block:: c
119+
120+
/* Method table for the cUnicode module: exposes unicode_to_string_and_back()
 * to Python as show(), called with positional arguments only. */
static PyMethodDef cUnicode_Methods[] = {
    {
        "show",
        reinterpret_cast<PyCFunction>(unicode_to_string_and_back),
        METH_VARARGS,
        "Convert a Python unicode string to std::string and back."
    },
    /* Sentinel: marks the end of the table. */
    {NULL, NULL, 0, NULL}
};
125+
126+
/* Definition of the cUnicode module, passed to PyModule_Create() by
 * PyInit_cUnicode(). */
static PyModuleDef cUnicodemodule = {
    PyModuleDef_HEAD_INIT,
    "cUnicode",                                /* Module name. */
    "cUnicode works with unicode strings.",    /* Module docstring. */
    -1,                                        /* Module state size. */
    cUnicode_Methods,                          /* Module-level functions. */
    NULL,                                      /* Remaining optional members are unused. */
    NULL,
    NULL,
    NULL
};
134+
135+
/* Module initialisation function, looked up by name when Python imports
 * cUnicode. Returns the new module object, or NULL if creation failed. */
PyMODINIT_FUNC
PyInit_cUnicode(void)
{
    /* PyModule_Create() already yields NULL on failure, so its result can be
     * handed straight back to the interpreter. */
    return PyModule_Create(&cUnicodemodule);
}
145+
146+
Here is an example of using this module:
147+
148+
.. code-block:: py
149+
150+
>>> import cUnicode
151+
>>> cUnicode.show('Hello')
152+
String size: 5 word size: 1
153+
0x00000048 72 "H"
154+
0x00000065 101 "e"
155+
0x0000006c 108 "l"
156+
0x0000006c 108 "l"
157+
0x0000006f 111 "o"
158+
'Hello'
159+
>>> s = "a\xac\u1234\u20ac\U00008000"
160+
>>> r = cUnicode.show(s)
161+
String size: 5 word size: 2
162+
0x00000061 97 "97"
163+
0x000000ac 172 "172"
164+
0x00001234 4660 "4660"
165+
0x000020ac 8364 "8364"
166+
0x00008000 32768 "32768"
167+
>>> r == s
168+
True
169+
>>> s = "a\xac\u1234\u20ac\U00018000"
170+
>>> r = cUnicode.show(s)
171+
String size: 5 word size: 4
172+
0x00000061 97 "97"
173+
0x000000ac 172 "172"
174+
0x00001234 4660 "4660"
175+
0x000020ac 8364 "8364"
176+
0x00018000 98304 "98304"
177+
>>> r == s
178+
True
179+
180+
-----------------------------------------------------------------------
181+
Working with ``bytes``, ``bytearray`` and UTF-8 Unicode Arguments
182+
-----------------------------------------------------------------------
183+
184+
It is fairly common to want to convert an argument that is ``bytes``, ``bytearray`` or UTF-8 to a ``std::string``. This function will do just that:
185+
186+
.. code-block:: c
187+
188+
/* Convert a PyObject to a std::string and return 0 if succesful.
189+
* If py_str is Unicode than treat it as UTF-8.
190+
* This works with Python 2.7 and Python 3.4 onwards.
191+
*/
192+
int py_string_to_std_string(const PyObject *py_str,
193+
std::string &result,
194+
bool utf8_only=true) {
195+
result.clear();
196+
if (PyBytes_Check(py_str)) {
197+
result = std::string(PyBytes_AS_STRING(py_str));
198+
return 0;
199+
}
200+
if (PyByteArray_Check(py_str)) {
201+
result = std::string(PyByteArray_AS_STRING(py_str));
202+
return 0;
203+
}
204+
// Must be unicode then.
205+
if (! PyUnicode_Check(py_str)) {
206+
PyErr_Format(PyExc_ValueError,
207+
"In %s \"py_str\" failed PyUnicode_Check()",
208+
__FUNCTION__);
209+
return -1;
210+
}
211+
if (PyUnicode_READY(py_str)) {
212+
PyErr_Format(PyExc_ValueError,
213+
"In %s \"py_str\" failed PyUnicode_READY()",
214+
__FUNCTION__);
215+
return -2;
216+
}
217+
if (utf8_only && PyUnicode_KIND(py_str) != PyUnicode_1BYTE_KIND) {
218+
PyErr_Format(PyExc_ValueError,
219+
"In %s \"py_str\" not utf-8",
220+
__FUNCTION__);
221+
return -3;
222+
}
223+
// Python 3 and its minor versions (they vary)
224+
// const Py_UCS1 *pChrs = PyUnicode_1BYTE_DATA(pyStr);
225+
// result = std::string(reinterpret_cast<const char*>(pChrs));
226+
#if PY_MAJOR_VERSION >= 3
227+
result = std::string((char*)PyUnicode_1BYTE_DATA(py_str));
228+
#else
229+
// Nasty cast away constness because PyString_AsString takes non-const in Py2
230+
result = std::string((char*)PyString_AsString(const_cast<PyObject *>(py_str)));
231+
#endif
232+
return 0;
233+
}
234+
235+
And these three do the reverse:
236+
237+
.. code-block:: c
238+
239+
/* Builds a new Python bytes object holding a copy of every byte of str.
 * Returns a new reference, or NULL with an exception set on failure. */
PyObject*
std_string_to_py_bytes(const std::string &str) {
    const char *raw_bytes = str.data();
    const Py_ssize_t byte_count = static_cast<Py_ssize_t>(str.size());
    return PyBytes_FromStringAndSize(raw_bytes, byte_count);
}
243+
244+
/* Builds a new Python bytearray object holding a copy of every byte of str.
 * Returns a new reference, or NULL with an exception set on failure. */
PyObject*
std_string_to_py_bytearray(const std::string &str) {
    const char *raw_bytes = str.data();
    const Py_ssize_t byte_count = static_cast<Py_ssize_t>(str.size());
    return PyByteArray_FromStringAndSize(raw_bytes, byte_count);
}
248+
249+
/* Builds a new Python Unicode string by decoding the bytes of str as UTF-8.
 * Returns a new reference, or NULL with an exception set on failure. */
PyObject*
std_string_to_py_utf8(const std::string &str) {
    // For pure ASCII content this gives the same result as:
    // PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, str.c_str(), str.size());
    // For bytes above 0x7F the two differ: this call decodes UTF-8, whereas
    // the 1-byte kind treats each byte as one Latin-1 character.
    const char *utf8_bytes = str.data();
    const Py_ssize_t byte_count = static_cast<Py_ssize_t>(str.size());
    return PyUnicode_FromStringAndSize(utf8_bytes, byte_count);
}
255+
256+

0 commit comments

Comments
 (0)