FreeBSD kernel libkern code
iconv_ucs.c
Go to the documentation of this file.
1 /*-
2  * Copyright (c) 2003, 2005 Ryuichiro Imura
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$BSDSUniX$");
29 
30 #include <sys/param.h>
31 #include <sys/kernel.h>
32 #include <sys/systm.h>
33 #include <sys/malloc.h>
34 #include <sys/iconv.h>
35 
36 #include "iconv_converter_if.h"
37 
38 /*
39  * "UCS" converter
40  */
41 
/*
 * Bit flags kept in iconv_ucs.convtype.  iconv_ucs_open() sets them from the
 * source/destination encoding names and iconv_ucs_conv() tests them to pick
 * the decoding and encoding branch for each character.
 */
42 #define KICONV_UCS_COMBINE 0x1
43 #define KICONV_UCS_FROM_UTF8 0x2
44 #define KICONV_UCS_TO_UTF8 0x4
45 #define KICONV_UCS_FROM_LE 0x8
46 #define KICONV_UCS_TO_LE 0x10
47 #define KICONV_UCS_FROM_UTF16 0x20
48 #define KICONV_UCS_TO_UTF16 0x40
49 #define KICONV_UCS_UCS4 0x80
50 
/* Encoding names compared against, or passed to, iconv_open()/iconv_add(). */
51 #define ENCODING_UTF16 "UTF-16BE"
52 #define ENCODING_UTF8 "UTF-8"
53 
/*
 * Table of Unicode-family encoding names together with the convtype flag
 * bits to set when a name matches the source (from_flag) or the destination
 * (to_flag) encoding.  The entry whose name is NULL terminates the table.
 */
54 static struct {
55  const char *name;
57 } unicode_family[] = {
59  { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE },
63  { NULL, 0, 0 }
64 };
65 
66 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
67 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
68 static uint32_t encode_surrogate(uint32_t code);
69 static uint32_t decode_surrogate(const u_char *ucs);
70 
71 #ifdef MODULE_DEPEND
72 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
73 #endif
74 
75 /*
76  * UCS converter instance
77  */
/*
 * Per-instance state of the UCS converter.
 *
 * convtype  KICONV_UCS_* flag bits computed in iconv_ucs_open()
 * d_csp     charset pair passed to iconv_ucs_open() as csp
 * d_cspf    charset pair passed as cspf (assigned only when cspf != NULL)
 * f_ctp     handle from iconv_open(ENCODING_UNICODE, from), or NULL
 * t_ctp     handle from iconv_open(to, ENCODING_UNICODE), or NULL
 * ctype     handle from iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8), or
 *           NULL; handed to towlower()/towupper() for case mapping
 */
78 struct iconv_ucs {
80  int convtype;
81  struct iconv_cspair * d_csp;
82  struct iconv_cspair * d_cspf;
83  void * f_ctp;
84  void * t_ctp;
85  void * ctype;
86 };
87 
/*
 * Create a UCS converter instance for charset pair csp (and, when non-NULL,
 * the second pair cspf) and hand it back through *dpp.
 *
 * The destination encoding name is csp->cp_to; the source encoding name is
 * cspf->cp_from when cspf is given, otherwise csp->cp_from.  The names are
 * matched against unicode_family[] to build dp->convtype, helper converters
 * are opened as needed, and cp_refcount is bumped on the pairs that are
 * retained.  Always returns 0.
 */
88 static int
89 iconv_ucs_open(struct iconv_converter_class *dcp,
90  struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
91 {
92  struct iconv_ucs *dp;
93  int i;
94  const char *from, *to;
95 
96  dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
97  to = csp->cp_to;
98  from = cspf ? cspf->cp_from : csp->cp_from;
99 
100  dp->convtype = 0;
101 
102  if (cspf)
104  for (i = 0; unicode_family[i].name; i++) {
 /* case-insensitive name match; a name may match as source, destination, or both */
105  if (strcasecmp(from, unicode_family[i].name) == 0)
106  dp->convtype |= unicode_family[i].from_flag;
107  if (strcasecmp(to, unicode_family[i].name) == 0)
108  dp->convtype |= unicode_family[i].to_flag;
109  }
 /*
  * ENCODING_UNICODE is defined outside this file.  Surrogate-pair (UCS4)
  * handling is enabled only when it is spelled exactly like ENCODING_UTF16.
  */
110  if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
111  dp->convtype |= KICONV_UCS_UCS4;
112  else
113  dp->convtype &= ~KICONV_UCS_UCS4;
114 
115  dp->f_ctp = dp->t_ctp = NULL;
116  if (dp->convtype & KICONV_UCS_COMBINE) {
 /*
  * A helper converter is opened only for a side that is neither UTF-8
  * nor little-endian UCS.
  * NOTE(review): the iconv_open() return values are ignored; presumably a
  * failed open leaves the handle NULL -- confirm.
  */
117  if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
118  (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
119  iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
120  }
121  if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
122  (dp->convtype & KICONV_UCS_TO_LE) == 0) {
123  iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
124  }
125  }
126 
127  dp->ctype = NULL;
 /* handle later passed to towlower()/towupper() in iconv_ucs_conv() */
129  iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
130 
131  dp->d_csp = csp;
133  if (cspf) {
134  dp->d_cspf = cspf;
135  cspf->cp_refcount++;
136  } else
137  csp->cp_refcount++;
138  }
140  csp->cp_refcount++;
141  *dpp = (void*)dp;
142  return 0;
143 }
144 
/*
 * Tear down a converter instance created by iconv_ucs_open(): close every
 * helper handle that is non-NULL, drop cp_refcount references on the saved
 * charset pairs, and delete the kobj instance.  Always returns 0.
 */
145 static int
146 iconv_ucs_close(void *data)
147 {
148  struct iconv_ucs *dp = data;
149 
150  if (dp->f_ctp)
151  iconv_close(dp->f_ctp);
152  if (dp->t_ctp)
153  iconv_close(dp->t_ctp);
154  if (dp->ctype)
155  iconv_close(dp->ctype);
 /*
  * NOTE(review): iconv_ucs_open() assigns d_cspf only when cspf != NULL, so
  * this test relies on the field being zero otherwise -- confirm that
  * kobj_create() zeroes the instance.
  */
156  if (dp->d_cspf)
157  dp->d_cspf->cp_refcount--;
159  dp->d_csp->cp_refcount--;
161  dp->d_csp->cp_refcount--;
162  kobj_delete((struct kobj*)data, M_ICONV);
163  return 0;
164 }
165 
/*
 * Convert characters from *inbuf into *outbuf, one character per loop
 * iteration.  Each character is first decoded into ucs[] (2 bytes, or 4
 * bytes for a surrogate pair, most significant byte first) and then encoded
 * from ucs[] into the destination form.
 *
 * d2p           converter instance (struct iconv_ucs *)
 * inbuf         in/out: advanced by the number of input bytes consumed
 * inbytesleft   in/out: reduced by the number of input bytes consumed
 * outbuf        in/out: advanced by the number of output bytes produced
 * outbytesleft  in/out: reduced by the number of output bytes produced
 * convchar      when 1, stop after a single character
 * casetype      KICONV_FROM_LOWER/KICONV_FROM_UPPER apply case mapping while
 *               decoding, KICONV_LOWER/KICONV_UPPER while encoding
 *
 * Returns 0 at once if any buffer pointer is NULL.  Otherwise it loops while
 * both input and output bytes remain and returns 0, -1 from one of the local
 * validity/space checks, or the non-zero result of iconv_convchr_case().
 * Bytes of a character that failed are not counted as consumed.
 */
166 static int
167 iconv_ucs_conv(void *d2p, const char **inbuf,
168  size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
169  int convchar, int casetype)
170 {
171  struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
172  int ret = 0, i;
173  size_t in, on, ir, or, inlen, outlen, ucslen;
174  const char *src, *p;
175  char *dst;
176  u_char ucs[4], *q;
177  uint32_t code;
178 
179  if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
180  return 0;
 /* in/on keep the initial counts; ir/or are the running remainders */
181  ir = in = *inbytesleft;
182  or = on = *outbytesleft;
183  src = *inbuf;
184  dst = *outbuf;
185 
186  while (ir > 0 && or > 0) {
187 
188  /*
189  * The first half of conversion.
190  * (convert any code into ENCODING_UNICODE)
191  */
192  code = 0;
193  p = src;
194  if (dp->convtype & KICONV_UCS_FROM_UTF8) {
195  /* convert UTF-8 to ENCODING_UNICODE */
196  inlen = 0;
197  code = utf8_to_ucs4(p, &inlen, ir);
 /* utf8_to_ucs4() returns 0 on failure; a decoded NUL is therefore also treated as an error */
198  if (code == 0) {
199  ret = -1;
200  break;
201  }
202 
203  if (casetype == KICONV_FROM_LOWER && dp->ctype) {
204  code = towlower(code, dp->ctype);
205  } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
206  code = towupper(code, dp->ctype);
207  }
208 
209  if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
210  /* reserved for utf-16 surrogate pair */
211  /* invalid unicode */
212  ret = -1;
213  break;
214  }
215 
 /* a 4-byte UTF-8 sequence is stored as a surrogate pair, which requires UCS4 mode */
216  if (inlen == 4) {
217  if (dp->convtype & KICONV_UCS_UCS4) {
218  ucslen = 4;
219  code = encode_surrogate(code);
220  } else {
221  /* can't handle with ucs-2 */
222  ret = -1;
223  break;
224  }
225  } else {
226  ucslen = 2;
227  }
228 
229  /* save UCS-4 into ucs[] */
 /* most significant byte first: shift by 8 * i for i = ucslen - 1 .. 0 */
230  for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
231  *q++ = (code >> (i << 3)) & 0xff;
232 
233  } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
234  /* convert local code to ENCODING_UNICODE */
235  ucslen = 4;
236  inlen = ir;
237  q = ucs;
238  ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
239  &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
240  if (ret)
241  break;
 /* turn the "bytes left" counts updated by the helper into bytes used */
242  inlen = ir - inlen;
243  ucslen = 4 - ucslen;
244 
245  } else {
246  /* src code is a proper subset of ENCODING_UNICODE */
 /*
  * NOTE(review): the two source bytes are read here before the
  * "ir < inlen" check further down, so with ir == 1 this appears to
  * read one byte past the remaining input -- confirm.
  */
247  q = ucs;
248  if (dp->convtype & KICONV_UCS_FROM_LE) {
249  *q = *(p + 1);
250  *(q + 1) = *p;
251  p += 2;
252  } else {
253  *q = *p++;
254  *(q + 1) = *p++;
255  }
 /* top byte 0xd8..0xdb: first (high) half of a surrogate pair */
256  if ((*q & 0xfc) == 0xd8) {
257  if (dp->convtype & KICONV_UCS_UCS4 &&
259  inlen = ucslen = 4;
260  } else {
261  /* invalid unicode */
262  ret = -1;
263  break;
264  }
265  } else {
266  inlen = ucslen = 2;
267  }
268  if (ir < inlen) {
269  ret = -1;
270  break;
271  }
272  if (ucslen == 4) {
273  q += 2;
274  if (dp->convtype & KICONV_UCS_FROM_LE) {
275  *q = *(p + 1);
276  *(q + 1) = *p;
277  } else {
278  *q = *p++;
279  *(q + 1) = *p;
280  }
 /* the second half must have a top byte of 0xdc..0xdf (low surrogate) */
281  if ((*q & 0xfc) != 0xdc) {
282  /* invalid unicode */
283  ret = -1;
284  break;
285  }
286  }
287  }
288 
289  /*
290  * The second half of conversion.
291  * (convert ENCODING_UNICODE into any code)
292  */
293  p = ucs;
294  if (dp->convtype & KICONV_UCS_TO_UTF8) {
295  q = (u_char *)dst;
296  if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
297  /* decode surrogate pair */
298  code = decode_surrogate(p);
299  } else {
300  code = (ucs[0] << 8) | ucs[1];
301  }
302 
303  if (casetype == KICONV_LOWER && dp->ctype) {
304  code = towlower(code, dp->ctype);
305  } else if (casetype == KICONV_UPPER && dp->ctype) {
306  code = towupper(code, dp->ctype);
307  }
308 
309  outlen = 0;
 /* NULL means the code point is not encodable or fewer than outlen bytes remain */
310  if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
311  ret = -1;
312  break;
313  }
314 
 /* commit the character only after both halves succeeded */
315  src += inlen;
316  ir -= inlen;
317  dst += outlen;
318  or -= outlen;
319 
320  } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
 /* the helper is given &dst and &or directly and updates them itself */
321  ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
322  &or, casetype & (KICONV_LOWER | KICONV_UPPER));
323  if (ret)
324  break;
325 
326  src += inlen;
327  ir -= inlen;
328 
329  } else {
330  /* dst code is a proper subset of ENCODING_UNICODE */
331  if (or < ucslen) {
332  ret = -1;
333  break;
334  }
335  src += inlen;
336  ir -= inlen;
337  or -= ucslen;
338  if (dp->convtype & KICONV_UCS_TO_LE) {
339  *dst++ = *(p + 1);
340  *dst++ = *p;
341  p += 2;
342  } else {
343  *dst++ = *p++;
344  *dst++ = *p++;
345  }
346  if (ucslen == 4) {
 /*
  * NOTE(review): by this point the counters were already advanced
  * and two bytes written, so this failure path still reports the
  * character as consumed -- confirm that is intended.
  */
347  if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
348  (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
349  ret = -1;
350  break;
351  }
352  if (dp->convtype & KICONV_UCS_TO_LE) {
353  *dst++ = *(p + 1);
354  *dst++ = *p;
355  } else {
356  *dst++ = *p++;
357  *dst++ = *p;
358  }
359  }
360  }
361 
362  if (convchar == 1)
363  break;
364  }
365 
 /* report progress to the caller as deltas from the initial counts */
366  *inbuf += in - ir;
367  *outbuf += on - or;
368  *inbytesleft -= in - ir;
369  *outbytesleft -= on - or;
370  return (ret);
371 }
372 
373 static int
374 iconv_ucs_init(struct iconv_converter_class *dcp)
375 {
376  int error;
377 
378  error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
379  if (error)
380  return (error);
381  error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
382  if (error)
383  return (error);
384  return (0);
385 }
386 
/*
 * Converter-class "done" method: performs no work and always reports
 * success (0).  The dcp argument is not used.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	const int ok = 0;

	return (ok);
}
392 
393 static const char *
394 iconv_ucs_name(struct iconv_converter_class *dcp)
395 {
396  return (ENCODING_UNICODE);
397 }
398 
/*
 * kobj method table that binds each iconv_converter interface method to the
 * UCS implementation in this file; the {0, 0} entry ends the table.
 */
399 static kobj_method_t iconv_ucs_methods[] = {
400  KOBJMETHOD(iconv_converter_open, iconv_ucs_open),
401  KOBJMETHOD(iconv_converter_close, iconv_ucs_close),
402  KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv),
403  KOBJMETHOD(iconv_converter_init, iconv_ucs_init),
404  KOBJMETHOD(iconv_converter_done, iconv_ucs_done),
405  KOBJMETHOD(iconv_converter_name, iconv_ucs_name),
406  {0, 0}
407 };
408 
/*
 * NOTE(review): KICONV_CONVERTER is defined outside this file; presumably it
 * declares the "ucs" converter class using iconv_ucs_methods and the given
 * instance size -- confirm against sys/iconv.h.
 */
409 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
410 
/*
 * Decode the UTF-8 sequence that starts at src into a UCS-4 code point.
 *
 * src        first (lead) byte of the sequence
 * utf8width  out: number of bytes the sequence occupies (1-4); written only
 *            when decoding succeeds
 * srclen     number of readable bytes at src; nothing beyond it is read
 *
 * Returns the decoded code point, or 0 when the input is empty, truncated,
 * starts with an invalid lead byte, contains a byte that is not a
 * continuation byte, or is an overlong encoding (a value stored in more
 * bytes than it needs).  A NUL byte also decodes to 0 (with *utf8width set
 * to 1), so callers cannot distinguish it from a failure.
 *
 * Surrogate values and values above 0x10ffff are not filtered here; the
 * caller performs that range check after case mapping.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	/* smallest code point that genuinely needs a w-byte sequence */
	static const uint32_t min_code[5] = { 0, 0, 0x80, 0x800, 0x10000 };
	size_t i, w;
	uint32_t ucs4;

	/* never dereference an empty buffer */
	if (srclen == 0)
		return (0);

	/*
	 * get leading 1 byte from utf-8
	 */
	if ((*src & 0x80) == 0) {
		/*
		 * leading 1 bit is "0"
		 *  utf-8: 0xxxxxxx
		 *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		/* get trailing 7 bits */
		ucs4 = *src & 0x7f;
	} else if ((*src & 0xe0) == 0xc0) {
		/*
		 * leading 3 bits are "110"
		 *  utf-8: 110xxxxx 10yyyyyy
		 *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		/* get trailing 5 bits */
		ucs4 = *src & 0x1f;
	} else if ((*src & 0xf0) == 0xe0) {
		/*
		 * leading 4 bits are "1110"
		 *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		/* get trailing 4 bits */
		ucs4 = *src & 0x0f;
	} else if ((*src & 0xf8) == 0xf0) {
		/*
		 * leading 5 bits are "11110"
		 *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		/* get trailing 3 bits */
		ucs4 = *src & 0x07;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	/* the whole sequence must be present before any of it is read */
	if (srclen < w)
		return (0);

	/*
	 * get left parts from utf-8
	 */
	for (i = 1; i < w; i++) {
		if ((src[i] & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		/* concatenate trailing 6 bits into ucs4 */
		ucs4 = (ucs4 << 6) | (uint32_t)(src[i] & 0x3f);
	}

	/*
	 * Reject overlong forms: accepting them lets one character be
	 * spelled several ways, which defeats byte-wise comparisons.
	 */
	if (ucs4 < min_code[w])
		return (0);

	*utf8width = w;
	return (ucs4);
}
482 
/*
 * Encode the code point ucs4 as UTF-8 at dst.
 *
 * ucs4       code point to encode
 * dst        output buffer
 * utf8width  out: number of bytes written (1-4); written only on success
 * dstlen     number of writable bytes at dst
 *
 * Returns dst (as u_char *) on success.  Returns NULL, writing nothing,
 * when ucs4 is above 0x10ffff (the largest value UTF-8 may encode, and the
 * same limit this file applies to decoded UTF-8 input) or when dstlen is
 * smaller than the encoded width.
 *
 * NOTE(review): surrogate values 0xd800-0xdfff are still encoded as 3-byte
 * sequences, as before; whether callers rely on that should be confirmed
 * before tightening it.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and leading bits
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0;	/* "0" */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* "11" */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* "111" */
	} else if (ucs4 < 0x110000) {
		w = 4;
		lead = 0xf0;	/* "1111" */
	} else {
		/* beyond the Unicode code space */
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8
	 */
	p = (u_char *)dst;
	/* fill continuation bytes from the last one backwards */
	for (i = w - 1; i >= 1; i--) {
		/* get trailing 6 bits and put it with leading bit as "1" */
		p[i] = (u_char)((ucs4 & 0x3f) | 0x80);
		ucs4 >>= 6;
	}
	/* whatever bits remain belong in the lead byte */
	*p = (u_char)(ucs4 | lead);

	*utf8width = w;

	return (p);
}
526 
/*
 * Pack a supplementary code point into a UTF-16 surrogate pair held in one
 * 32-bit word: the high surrogate in the upper 16 bits, the low surrogate in
 * the lower 16 bits.  No range check is made on code.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	const uint32_t offset = code - 0x10000;
	/* bits 10..19 of the offset go to the high surrogate */
	const uint32_t high_bits = (offset >> 10) & 0x3ff;
	/* bits 0..9 of the offset go to the low surrogate */
	const uint32_t low_bits = offset & 0x3ff;

	return ((high_bits << 16) | low_bits | 0xd800dc00);
}
533 
/*
 * Rebuild a code point from a surrogate pair stored as four bytes, most
 * significant byte first: ucs[0..1] is the high surrogate and ucs[2..3] the
 * low surrogate.  Only the low two bits of ucs[0] and ucs[2] are used; the
 * surrogate marker bits are not verified.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	/* 10 payload bits from each half */
	const uint32_t high_bits = ((uint32_t)(ucs[0] & 0x3) << 8) | ucs[1];
	const uint32_t low_bits = ((uint32_t)(ucs[2] & 0x3) << 8) | ucs[3];

	return (((high_bits << 10) | low_bits) + 0x10000);
}
540 
static int iconv_ucs_init(struct iconv_converter_class *dcp)
Definition: iconv_ucs.c:374
const char ** inbuf
struct iconv_cspair * d_csp
Definition: iconv_ucs.c:81
int from_flag
Definition: iconv_ucs.c:56
#define KICONV_UCS_COMBINE
Definition: iconv_ucs.c:42
static int iconv_ucs_conv(void *d2p, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int convchar, int casetype)
Definition: iconv_ucs.c:167
static struct @0 unicode_family[]
static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
Definition: iconv_ucs.c:412
void * ctype
Definition: iconv_ucs.c:85
__FBSDID("$BSDSUniX$")
#define KICONV_UCS_FROM_UTF16
Definition: iconv_ucs.c:47
int iconv_close(void *handle)
Definition: iconv.c:273
int convtype
Definition: iconv_ucs.c:80
static int iconv_ucs_close(void *data)
Definition: iconv_ucs.c:146
int iconv_open(const char *to, const char *from, void **handle)
Definition: iconv.c:236
#define ENCODING_UTF8
Definition: iconv_ucs.c:52
static u_char * ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
Definition: iconv_ucs.c:484
int iconv_add(const char *converter, const char *to, const char *from)
Definition: iconv.c:383
static kobj_method_t iconv_ucs_methods[]
Definition: iconv_ucs.c:399
int convchar
int towlower(int c, void *handle)
Definition: iconv.c:307
#define KICONV_UCS_UCS4
Definition: iconv_ucs.c:49
#define KICONV_UCS_FROM_UTF8
Definition: iconv_ucs.c:43
#define ENCODING_UTF16
Definition: iconv_ucs.c:51
int to_flag
Definition: iconv_ucs.c:56
int casetype
struct iconv_cspair * d_cspf
Definition: iconv_ucs.c:82
#define KICONV_UCS_TO_LE
Definition: iconv_ucs.c:46
void * f_ctp
Definition: iconv_ucs.c:83
int towupper(int c, void *handle)
Definition: iconv.c:313
char ** outbuf
static uint32_t encode_surrogate(uint32_t code)
int strcasecmp(const char *s1, const char *s2)
Definition: strcasecmp.c:42
static int iconv_ucs_done(struct iconv_converter_class *dcp)
Definition: iconv_ucs.c:388
const char * name
Definition: iconv_ucs.c:55
size_t * outbytesleft
int iconv_convchr_case(void *handle, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int casetype)
Definition: iconv.c:300
void * t_ctp
Definition: iconv_ucs.c:84
#define KICONV_UCS_FROM_LE
Definition: iconv_ucs.c:45
#define KICONV_UCS_TO_UTF16
Definition: iconv_ucs.c:48
static int iconv_ucs_open(struct iconv_converter_class *dcp, struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
Definition: iconv_ucs.c:89
#define KICONV_UCS_TO_UTF8
Definition: iconv_ucs.c:44
int strcmp(const char *s1, const char *s2)
Definition: strcmp.c:42
size_t * inbytesleft
static uint32_t decode_surrogate(const u_char *ucs)
KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs))
static const char * iconv_ucs_name(struct iconv_converter_class *dcp)
Definition: iconv_ucs.c:394