001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 * 019 */ 020package org.apache.directory.api.util; 021 022 023import java.io.IOException; 024import java.io.ObjectInput; 025import java.io.ObjectOutput; 026 027 028/** 029 * Various unicode manipulation methods that are more efficient then chaining 030 * operations: all is done in the same buffer without creating a bunch of string 031 * objects. 032 * 033 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a> 034 */ 035public final class Unicode 036{ 037 private static final int UTF8_MULTI_BYTES_MASK = 0x0080; 038 private static final int UTF8_TWO_BYTES_MASK = 0x00E0; 039 private static final int UTF8_TWO_BYTES = 0x00C0; 040 private static final int UTF8_THREE_BYTES_MASK = 0x00F0; 041 private static final int UTF8_THREE_BYTES = 0x00E0; 042 private static final int UTF8_FOUR_BYTES_MASK = 0x00F8; 043 private static final int UTF8_FOUR_BYTES = 0x00F0; 044 private static final int UTF8_FIVE_BYTES_MASK = 0x00FC; 045 private static final int UTF8_FIVE_BYTES = 0x00F8; 046 private static final int UTF8_SIX_BYTES_MASK = 0x00FE; 047 private static final int UTF8_SIX_BYTES = 0x00FC; 048 049 /** %01-%27 %2B-%5B %5D-%7F */ 050 private static final boolean[] UNICODE_SUBSET = 051 { 052 // '\0' 053 false, true, true, true, true, true, true, true, 054 true, true, true, true, true, true, true, true, 055 true, true, true, true, true, true, true, true, 056 true, true, true, true, true, true, true, true, 057 true, true, true, true, true, true, true, true, 058 // '(', ')', '*' 059 false, false, false, true, true, true, true, true, 060 true, true, true, true, true, true, true, true, 061 true, true, true, true, true, true, true, true, 062 true, true, true, true, true, true, true, true, 063 true, true, true, true, true, true, true, true, 064 true, true, true, true, true, true, true, true, 065 // '\' 066 true, true, true, true, false, true, true, true, 067 true, true, true, true, true, true, true, true, 068 true, true, true, true, true, true, true, true, 069 true, true, true, true, true, true, true, true, 070 true, true, true, true, true, true, true, true, 071 }; 072 private static final int CHAR_ONE_BYTE_MASK = 0xFFFFFF80; 073 private static final int CHAR_TWO_BYTES_MASK = 0xFFFFF800; 074 private static final int CHAR_THREE_BYTES_MASK = 0xFFFF0000; 075 private static final int CHAR_FOUR_BYTES_MASK = 0xFFE00000; 076 private static final int CHAR_FIVE_BYTES_MASK = 0xFC000000; 077 private static final int CHAR_SIX_BYTES_MASK = 0x80000000; 078 079 /** 080 * Count the number of bytes needed to return an Unicode char. This can be 081 * from 1 to 6. 082 * 083 * @param bytes The bytes to read 084 * @param pos Position to start counting. It must be a valid start of a 085 * encoded char ! 086 * @return The number of bytes to create a char, or -1 if the encoding is 087 * wrong. TODO : Should stop after the third byte, as a char is only 088 * 2 bytes long. 089 */ 090 public static int countBytesPerChar( byte[] bytes, int pos ) 091 { 092 if ( bytes == null ) 093 { 094 return -1; 095 } 096 097 if ( ( bytes[pos] & UTF8_MULTI_BYTES_MASK ) == 0 ) 098 { 099 return 1; 100 } 101 else if ( ( bytes[pos] & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES ) 102 { 103 return 2; 104 } 105 else if ( ( bytes[pos] & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES ) 106 { 107 return 3; 108 } 109 else if ( ( bytes[pos] & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES ) 110 { 111 return 4; 112 } 113 else if ( ( bytes[pos] & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES ) 114 { 115 return 5; 116 } 117 else if ( ( bytes[pos] & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES ) 118 { 119 return 6; 120 } 121 else 122 { 123 return -1; 124 } 125 } 126 127 128 /** 129 * Return the Unicode char which is coded in the bytes at position 0. 130 * 131 * @param bytes The byte[] represntation of an Unicode string. 132 * @return The first char found. 133 */ 134 public static char bytesToChar( byte[] bytes ) 135 { 136 return bytesToChar( bytes, 0 ); 137 } 138 139 140 /** 141 * Return the Unicode char which is coded in the bytes at the given 142 * position. 143 * 144 * @param bytes The byte[] represntation of an Unicode string. 145 * @param pos The current position to start decoding the char 146 * @return The decoded char, or -1 if no char can be decoded TODO : Should 147 * stop after the third byte, as a char is only 2 bytes long. 148 */ 149 public static char bytesToChar( byte[] bytes, int pos ) 150 { 151 if ( bytes == null ) 152 { 153 return ( char ) -1; 154 } 155 156 if ( ( bytes[pos] & UTF8_MULTI_BYTES_MASK ) == 0 ) 157 { 158 return ( char ) bytes[pos]; 159 } 160 else 161 { 162 if ( ( bytes[pos] & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES ) 163 { 164 // Two bytes char 165 // 110x-xxyy 10zz-zzzz -> 0000-0xxx yyzz-zzzz 166 return ( char ) ( ( ( bytes[pos] & 0x1C ) << 6 ) + ( ( bytes[pos] & 0x03 ) << 6 ) + ( bytes[pos + 1] & 0x3F ) ); 167 } 168 else if ( ( bytes[pos] & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES ) 169 { 170 // Three bytes char 171 // 1110-tttt 10xx-xxyy 10zz-zzzz -> tttt-xxxx yyzz-zzzz (FF FF) 172 return ( char ) ( ( ( bytes[pos] & 0x0F ) << 12 ) 173 + ( ( bytes[pos + 1] & 0x3C ) << 6 ) 174 + ( ( bytes[pos + 1] & 0x03 ) << 6 ) 175 + ( bytes[pos + 2] & 0x3F ) 176 ); 177 } 178 else if ( ( bytes[pos] & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES ) 179 { 180 // Four bytes char 181 return ( char ) ( 182 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 000t-ttuu vvvv-xxxx yyzz-zzzz (1FFFFF) 183 ( ( bytes[pos] & 0x07 ) << 18 ) 184 + ( ( bytes[pos + 1] & 0x30 ) << 16 ) 185 + ( ( bytes[pos + 1] & 0x0F ) << 12 ) 186 + ( ( bytes[pos + 2] & 0x3C ) << 6 ) 187 + ( ( bytes[pos + 2] & 0x03 ) << 6 ) 188 + ( bytes[pos + 3] & 0x3F ) 189 ); 190 } 191 else if ( ( bytes[pos] & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES ) 192 { 193 // Five bytes char 194 return ( char ) ( 195 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 196 // 0000-00tt uuuu-uuvv wwww-xxxx yyzz-zzzz (03 FF FF FF) 197 ( ( bytes[pos] & 0x03 ) << 24 ) 198 + ( ( bytes[pos + 1] & 0x3F ) << 18 ) 199 + ( ( bytes[pos + 2] & 0x30 ) << 12 ) 200 + ( ( bytes[pos + 2] & 0x0F ) << 12 ) 201 + ( ( bytes[pos + 3] & 0x3C ) << 6 ) 202 + ( ( bytes[pos + 3] & 0x03 ) << 6 ) 203 + ( bytes[pos + 4] & 0x3F ) 204 ); 205 } 206 else if ( ( bytes[pos] & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES ) 207 { 208 // Six bytes char 209 return ( char ) ( 210 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz 211 // -> 0stt-tttt uuuu-uuvv wwww-xxxx yyzz-zzzz (7F FF FF FF) 212 ( ( bytes[pos] & 0x01 ) << 30 ) 213 + ( ( bytes[pos + 1] & 0x3F ) << 24 ) 214 + ( ( bytes[pos + 2] & 0x3F ) << 18 ) 215 + ( ( bytes[pos + 3] & 0x30 ) << 12 ) 216 + ( ( bytes[pos + 3] & 0x0F ) << 12 ) 217 + ( ( bytes[pos + 4] & 0x3C ) << 6 ) 218 + ( ( bytes[pos + 4] & 0x03 ) << 6 ) 219 + ( bytes[pos + 5] & 0x3F ) 220 ); 221 } 222 else 223 { 224 return ( char ) -1; 225 } 226 } 227 } 228 229 230 /** 231 * Return the number of bytes that hold an Unicode char. 232 * 233 * @param car The character to be decoded 234 * @return The number of bytes to hold the char. TODO : Should stop after 235 * the third byte, as a char is only 2 bytes long. 236 */ 237 public static int countNbBytesPerChar( char car ) 238 { 239 if ( ( car & CHAR_ONE_BYTE_MASK ) == 0 ) 240 { 241 return 1; 242 } 243 else if ( ( car & CHAR_TWO_BYTES_MASK ) == 0 ) 244 { 245 return 2; 246 } 247 else if ( ( car & CHAR_THREE_BYTES_MASK ) == 0 ) 248 { 249 return 3; 250 } 251 else if ( ( car & CHAR_FOUR_BYTES_MASK ) == 0 ) 252 { 253 return 4; 254 } 255 else if ( ( car & CHAR_FIVE_BYTES_MASK ) == 0 ) 256 { 257 return 5; 258 } 259 else if ( ( car & CHAR_SIX_BYTES_MASK ) == 0 ) 260 { 261 return 6; 262 } 263 else 264 { 265 return -1; 266 } 267 } 268 269 270 /** 271 * Count the number of bytes included in the given char[]. 272 * 273 * @param chars The char array to decode 274 * @return The number of bytes in the char array 275 */ 276 public static int countBytes( char[] chars ) 277 { 278 if ( chars == null ) 279 { 280 return 0; 281 } 282 283 int nbBytes = 0; 284 int currentPos = 0; 285 286 while ( currentPos < chars.length ) 287 { 288 int nbb = countNbBytesPerChar( chars[currentPos] ); 289 290 // If the number of bytes necessary to encode a character is 291 // above 3, we will need two UTF-16 chars 292 currentPos += ( nbb < 4 ? 1 : 2 ); 293 nbBytes += nbb; 294 } 295 296 return nbBytes; 297 } 298 299 300 /** 301 * Count the number of chars included in the given byte[]. 302 * 303 * @param bytes The byte array to decode 304 * @return The number of char in the byte array 305 */ 306 public static int countChars( byte[] bytes ) 307 { 308 if ( bytes == null ) 309 { 310 return 0; 311 } 312 313 int nbChars = 0; 314 int currentPos = 0; 315 316 while ( currentPos < bytes.length ) 317 { 318 currentPos += countBytesPerChar( bytes, currentPos ); 319 nbChars++; 320 } 321 322 return nbChars; 323 } 324 325 326 /** 327 * Return the Unicode char which is coded in the bytes at the given 328 * position. 329 * 330 * @param car The character to be transformed to an array of bytes 331 * 332 * @return The byte array representing the char 333 * 334 * TODO : Should stop after the third byte, as a char is only 2 bytes long. 335 */ 336 public static byte[] charToBytes( char car ) 337 { 338 byte[] bytes = new byte[countNbBytesPerChar( car )]; 339 340 if ( car <= 0x7F ) 341 { 342 // Single byte char 343 bytes[0] = ( byte ) car; 344 return bytes; 345 } 346 else if ( car <= 0x7FF ) 347 { 348 // two bytes char 349 bytes[0] = ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) ); 350 bytes[1] = ( byte ) ( 0x0080 + ( car & 0x3F ) ); 351 } 352 else 353 { 354 // Three bytes char 355 bytes[0] = ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) ); 356 bytes[1] = ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) ); 357 bytes[2] = ( byte ) ( 0x0080 + ( car & 0x3F ) ); 358 } 359 360 return bytes; 361 } 362 363 364 /** 365 * Check if the current char is in the unicodeSubset : all chars but 366 * '\0', '(', ')', '*' and '\' 367 * 368 * @param str The string to check 369 * @param pos Position of the current char 370 * @return True if the current char is in the unicode subset 371 */ 372 public static boolean isUnicodeSubset( String str, int pos ) 373 { 374 if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) ) 375 { 376 return false; 377 } 378 379 char c = str.charAt( pos ); 380 381 return ( ( c > 127 ) || UNICODE_SUBSET[c] ); 382 } 383 384 385 /** 386 * Check if the current char is in the unicodeSubset : all chars but 387 * '\0', '(', ')', '*' and '\' 388 * 389 * @param c The char to check 390 * @return True if the current char is in the unicode subset 391 */ 392 public static boolean isUnicodeSubset( char c ) 393 { 394 return ( ( c > 127 ) || UNICODE_SUBSET[c] ); 395 } 396 397 398 /** 399 * Check if the current byte is in the unicodeSubset : all chars but 400 * '\0', '(', ')', '*' and '\' 401 * 402 * @param b The byte to check 403 * @return True if the current byte is in the unicode subset 404 */ 405 public static boolean isUnicodeSubset( byte b ) 406 { 407 return ( ( b < 0 ) || ( b > 127 ) || UNICODE_SUBSET[b] ); 408 } 409 410 411 /** 412 * 413 * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation 414 * of every character in the string str. If str is null, the string value 'null' is written with a length of 0 415 * instead of throwing an NullPointerException. Each character in the string s is converted to a group of one, 416 * two, or three bytes, depending on the value of the character. 417 * 418 * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length is 419 * written in the length information (four bytes (writeInt)) and the string is split into smaller parts 420 * if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535 bytes 421 * can be written at maximum we're on the save side when writing a chunk of only 21845 (65535/3) characters at 422 * once. 423 * 424 * See also {@link java.io.DataOutput#writeUTF(String)}. 425 * 426 * @param objectOutput The objectOutput to write to 427 * @param str The value to write 428 * @throws java.io.IOException If the value can't be written to the file 429 */ 430 public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException 431 { 432 // Write a 'null' string 433 if ( str == null ) 434 { 435 objectOutput.writeInt( 0 ); 436 objectOutput.writeUTF( "null" ); 437 } 438 else 439 { 440 // Write length of string 441 objectOutput.writeInt( str.length() ); 442 443 StringBuffer strBuf = new StringBuffer( str ); 444 445 // Write the string in portions not larger than 21845 characters 446 while ( strBuf != null ) 447 { 448 if ( strBuf.length() < 21845 ) 449 { 450 objectOutput.writeUTF( strBuf.substring( 0, strBuf.length() ) ); 451 strBuf = null; 452 } 453 else 454 { 455 objectOutput.writeUTF( strBuf.substring( 0, 21845 ) ); 456 strBuf.delete( 0, 21845 ); 457 } 458 } 459 } 460 } 461 462 463 /** 464 * 465 * Reads in a string that has been encoded using a modified UTF-8 format. The general contract of readUTF is 466 * that it reads a representation of a Unicode character string encoded in modified UTF-8 format; this string of 467 * characters is then returned as a String. 468 * 469 * First, four bytes are read (readInt) and used to construct an unsigned 16-bit integer in exactly the manner 470 * of the readUnsignedShort method . This integer value is called the UTF length and specifies the number of 471 * additional bytes to be read. These bytes are then converted to characters by considering them in groups. The 472 * length of each group is computed from the value of the first byte of the group. The byte following a group, if 473 * any, is the first byte of the next group. 474 * 475 *See also {@link java.io.DataInput#readUTF()}. 476 * 477 * @param objectInput The objectInput to read from 478 * @return The read string 479 * @throws java.io.IOException If the value can't be read 480 */ 481 public static String readUTF( ObjectInput objectInput ) throws IOException 482 { 483 StringBuffer strBuf = null; 484 485 // Read length of the string 486 int strLength = objectInput.readInt(); 487 488 // Start reading the string 489 strBuf = new StringBuffer( objectInput.readUTF() ); 490 491 if ( ( strLength == 0 ) && ( "null".equals( strBuf.toString() ) ) ) 492 { 493 // The special case of a 'null' string 494 return null; 495 } 496 else 497 { 498 while ( strLength > strBuf.length() ) 499 { 500 strBuf.append( objectInput.readUTF() ); 501 } 502 return strBuf.toString(); 503 } 504 } 505 506 507 private Unicode() 508 { 509 } 510}