001/*
002 *  Licensed to the Apache Software Foundation (ASF) under one
003 *  or more contributor license agreements.  See the NOTICE file
004 *  distributed with this work for additional information
005 *  regarding copyright ownership.  The ASF licenses this file
006 *  to you under the Apache License, Version 2.0 (the
007 *  "License"); you may not use this file except in compliance
008 *  with the License.  You may obtain a copy of the License at
009 * 
010 *    http://www.apache.org/licenses/LICENSE-2.0
011 * 
012 *  Unless required by applicable law or agreed to in writing,
013 *  software distributed under the License is distributed on an
014 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 *  KIND, either express or implied.  See the License for the
016 *  specific language governing permissions and limitations
017 *  under the License.
018 * 
019 */
020package org.apache.directory.api.util;
021
022
023import java.io.IOException;
024import java.io.ObjectInput;
025import java.io.ObjectOutput;
026
027
/**
 * Various unicode manipulation methods that are more efficient than chaining
 * operations: all is done in the same buffer without creating a bunch of string
 * objects.
 * 
 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a>
 */
public final class Unicode
{
    // Lead byte patterns : a lead byte masked with XXX_MASK equals XXX when
    // it starts a sequence of that many bytes.
    private static final int UTF8_MULTI_BYTES_MASK = 0x0080;
    private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
    private static final int UTF8_TWO_BYTES = 0x00C0;
    private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
    private static final int UTF8_THREE_BYTES = 0x00E0;
    private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
    private static final int UTF8_FOUR_BYTES = 0x00F0;
    private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
    private static final int UTF8_FIVE_BYTES = 0x00F8;
    private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
    private static final int UTF8_SIX_BYTES = 0x00FC;

    /** The six payload bits carried by a continuation byte (10xx-xxxx) */
    private static final int UTF8_CONTINUATION_PAYLOAD = 0x003F;

    /** %01-%27 %2B-%5B %5D-%7F */
    private static final boolean[] UNICODE_SUBSET =
        {
            // '\0'
            false, true,  true,  true,  true,  true,  true,  true, 
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            // '(', ')', '*'
            false, false, false, true,  true,  true,  true,  true, 
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            // '\'
            true,  true,  true,  true,  false, true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
        };

    // A value masked with CHAR_XXX_MASK is 0 when it fits in that many bytes
    private static final int CHAR_ONE_BYTE_MASK = 0xFFFFFF80;
    private static final int CHAR_TWO_BYTES_MASK = 0xFFFFF800;
    private static final int CHAR_THREE_BYTES_MASK = 0xFFFF0000;
    private static final int CHAR_FOUR_BYTES_MASK = 0xFFE00000;
    private static final int CHAR_FIVE_BYTES_MASK = 0xFC000000;
    private static final int CHAR_SIX_BYTES_MASK = 0x80000000;

    /**
     * The maximum number of chars written by a single writeUTF call : each
     * char takes at most 3 bytes and a call is limited to 65535 bytes
     * (65535 / 3).
     */
    private static final int UTF_CHUNK_SIZE = 21845;


    /**
     * Count the number of bytes needed to return an Unicode char. This can be
     * from 1 to 6. Only the byte at the given position is examined.
     *
     * @param bytes The bytes to read
     * @param pos Position to start counting. It must be a valid start of a
     *            encoded char !
     * @return The number of bytes to create a char, or -1 if the byte array is
     *         null or if the byte at this position cannot start an encoded
     *         char. TODO : Should stop after the third byte, as a char is only
     *         2 bytes long.
     * @throws ArrayIndexOutOfBoundsException If pos is not an index of bytes
     */
    public static int countBytesPerChar( byte[] bytes, int pos )
    {
        if ( bytes == null )
        {
            return -1;
        }

        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            return 1;
        }

        if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            return 2;
        }

        if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            return 3;
        }

        if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            return 4;
        }

        if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            return 5;
        }

        if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            return 6;
        }

        // A continuation byte (10xx-xxxx), 0xFE or 0xFF
        return -1;
    }


    /**
     * Return the Unicode char which is coded in the bytes at position 0.
     *
     * @param bytes The byte[] representation of an Unicode string.
     * @return The first char found.
     */
    public static char bytesToChar( byte[] bytes )
    {
        return bytesToChar( bytes, 0 );
    }


    /**
     * Return the Unicode char which is coded in the bytes at the given
     * position.
     * <p>
     * The decoded value is cast to a char : when the sequence encodes a value
     * above 0xFFFF, only its 16 lower bits are returned.
     *
     * @param bytes The byte[] representation of an Unicode string.
     * @param pos The current position to start decoding the char
     * @return The decoded char, or (char)-1 (0xFFFF) if the byte array is null
     *         or if the byte at this position cannot start an encoded char.
     *         TODO : Should stop after the third byte, as a char is only 2
     *         bytes long.
     * @throws ArrayIndexOutOfBoundsException If the encoded sequence runs past
     *         the end of the array
     */
    public static char bytesToChar( byte[] bytes, int pos )
    {
        if ( bytes == null )
        {
            return ( char ) -1;
        }

        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            // Single byte char : 0xxx-xxxx
            return ( char ) lead;
        }

        if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            // Two bytes char
            // 110x-xxyy 10zz-zzzz -> 0000-0xxx yyzz-zzzz
            return ( char ) ( ( ( lead & 0x1F ) << 6 )
                | payload( bytes, pos + 1 ) );
        }

        if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            // Three bytes char
            // 1110-tttt 10xx-xxyy 10zz-zzzz -> tttt-xxxx yyzz-zzzz (FF FF)
            return ( char ) ( ( ( lead & 0x0F ) << 12 )
                | ( payload( bytes, pos + 1 ) << 6 )
                | payload( bytes, pos + 2 ) );
        }

        if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            // Four bytes char
            // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 000t-ttuu vvvv-xxxx yyzz-zzzz (1FFFFF)
            return ( char ) ( ( ( lead & 0x07 ) << 18 )
                | ( payload( bytes, pos + 1 ) << 12 )
                | ( payload( bytes, pos + 2 ) << 6 )
                | payload( bytes, pos + 3 ) );
        }

        if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            // Five bytes char
            // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
            // 0000-00tt uuuu-uuvv wwww-xxxx yyzz-zzzz (03 FF FF FF)
            return ( char ) ( ( ( lead & 0x03 ) << 24 )
                | ( payload( bytes, pos + 1 ) << 18 )
                | ( payload( bytes, pos + 2 ) << 12 )
                | ( payload( bytes, pos + 3 ) << 6 )
                | payload( bytes, pos + 4 ) );
        }

        if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            // Six bytes char
            // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
            // -> 0stt-tttt uuuu-uuvv wwww-xxxx yyzz-zzzz (7F FF FF FF)
            return ( char ) ( ( ( lead & 0x01 ) << 30 )
                | ( payload( bytes, pos + 1 ) << 24 )
                | ( payload( bytes, pos + 2 ) << 18 )
                | ( payload( bytes, pos + 3 ) << 12 )
                | ( payload( bytes, pos + 4 ) << 6 )
                | payload( bytes, pos + 5 ) );
        }

        // Not a valid lead byte
        return ( char ) -1;
    }


    /**
     * Extract the six payload bits of the continuation byte at the given index.
     *
     * @param bytes The encoded bytes
     * @param index The index of the continuation byte
     * @return The six lower bits of the byte
     */
    private static int payload( byte[] bytes, int index )
    {
        return bytes[index] & UTF8_CONTINUATION_PAYLOAD;
    }


    /**
     * Return the number of bytes that hold an Unicode char.
     * <p>
     * As a char is at most 0xFFFF, the result is always 1, 2 or 3 : the
     * remaining tests can't match for a char.
     *
     * @param car The character to be decoded
     * @return The number of bytes to hold the char. TODO : Should stop after
     *         the third byte, as a char is only 2 bytes long.
     */
    public static int countNbBytesPerChar( char car )
    {
        if ( ( car & CHAR_ONE_BYTE_MASK ) == 0 )
        {
            return 1;
        }

        if ( ( car & CHAR_TWO_BYTES_MASK ) == 0 )
        {
            return 2;
        }

        if ( ( car & CHAR_THREE_BYTES_MASK ) == 0 )
        {
            return 3;
        }

        if ( ( car & CHAR_FOUR_BYTES_MASK ) == 0 )
        {
            return 4;
        }

        if ( ( car & CHAR_FIVE_BYTES_MASK ) == 0 )
        {
            return 5;
        }

        if ( ( car & CHAR_SIX_BYTES_MASK ) == 0 )
        {
            return 6;
        }

        return -1;
    }


    /**
     * Count the number of bytes included in the given char[].
     * <p>
     * Each char is counted on its own, using
     * {@link #countNbBytesPerChar(char)} : each half of a surrogate pair is
     * thus counted as three bytes, which is what {@link #charToBytes(char)}
     * produces for it.
     *
     * @param chars The char array to decode
     * @return The number of bytes in the char array, 0 if it is null
     */
    public static int countBytes( char[] chars )
    {
        if ( chars == null )
        {
            return 0;
        }

        int nbBytes = 0;

        // countNbBytesPerChar never returns more than 3 for a char, so the
        // array is always walked one char at a time.
        for ( char car : chars )
        {
            nbBytes += countNbBytesPerChar( car );
        }

        return nbBytes;
    }


    /**
     * Count the number of chars included in the given byte[].
     * <p>
     * Only the lead byte of each encoded char is examined. A sequence
     * truncated by the end of the array is still counted as one char.
     *
     * @param bytes The byte array to decode
     * @return The number of char in the byte array, 0 if it is null, or -1 if
     *         a byte which cannot start an encoded char is found where a lead
     *         byte is expected
     */
    public static int countChars( byte[] bytes )
    {
        if ( bytes == null )
        {
            return 0;
        }

        int nbChars = 0;
        int currentPos = 0;

        while ( currentPos < bytes.length )
        {
            int nbBytes = countBytesPerChar( bytes, currentPos );

            if ( nbBytes < 0 )
            {
                // Adding -1 to the position would make it go backward,
                // looping forever or reading before the start of the array.
                return -1;
            }

            currentPos += nbBytes;
            nbChars++;
        }

        return nbChars;
    }


    /**
     * Return the bytes which encode the given char.
     *
     * @param car The character to be transformed to an array of bytes
     *
     * @return The byte array representing the char : one, two or three bytes
     *
     * TODO : Should stop after the third byte, as a char is only 2 bytes long.
     */
    public static byte[] charToBytes( char car )
    {
        byte[] bytes = new byte[countNbBytesPerChar( car )];

        if ( car <= 0x7F )
        {
            // Single byte char : 0xxx-xxxx
            bytes[0] = ( byte ) car;
        }
        else if ( car <= 0x7FF )
        {
            // Two bytes char : 110x-xxxx 10xx-xxxx
            bytes[0] = ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) );
            bytes[1] = ( byte ) ( 0x0080 + ( car & 0x3F ) );
        }
        else
        {
            // Three bytes char : 1110-xxxx 10xx-xxxx 10xx-xxxx
            bytes[0] = ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) );
            bytes[1] = ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) );
            bytes[2] = ( byte ) ( 0x0080 + ( car & 0x3F ) );
        }

        return bytes;
    }


    /**
     * Check if the current char is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param str The string to check
     * @param pos Position of the current char
     * @return True if the current char is in the unicode subset, false if it
     *         is not, if the string is null or if the position is out of the
     *         string
     */
    public static boolean isUnicodeSubset( String str, int pos )
    {
        if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) )
        {
            return false;
        }

        char c = str.charAt( pos );

        return ( ( c > 127 ) || UNICODE_SUBSET[c] );
    }


    /**
     * Check if the current char is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param c The char to check
     * @return True if the current char is in the unicode subset
     */
    public static boolean isUnicodeSubset( char c )
    {
        return ( ( c > 127 ) || UNICODE_SUBSET[c] );
    }


    /**
     * Check if the current byte is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param b The byte to check
     * @return True if the current byte is in the unicode subset. A negative
     *         byte (high bit set) is always accepted.
     */
    public static boolean isUnicodeSubset( byte b )
    {
        // A byte never exceeds 127 : only the sign has to be checked before
        // using it as an index.
        return ( ( b < 0 ) || UNICODE_SUBSET[b] );
    }


    /**
     *
     * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation
     * of every character in the string str. If str is null, the string value 'null' is written with a length of 0
     * instead of throwing an NullPointerException. Each character in the string s  is converted to a group of one,
     * two, or three bytes, depending on the value of the character.
     *
     * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length is
     * written in the length information (four bytes (writeInt)) and the string is split into smaller parts
     * if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535 bytes
     * can be written at maximum we're on the safe side when writing a chunk of only 21845 (65535/3) characters at
     * once.
     *
     * At least one chunk is always written (an empty one for an empty string), and never more chunks than needed
     * to hold the announced number of characters, so that {@link #readUTF(ObjectInput)} consumes exactly what has
     * been written.
     *
     * See also {@link java.io.DataOutput#writeUTF(String)}.
     *
     * @param objectOutput The objectOutput to write to
     * @param str The value to write
     * @throws java.io.IOException If the value can't be written to the file
     */
    public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException
    {
        // Write a 'null' string
        if ( str == null )
        {
            objectOutput.writeInt( 0 );
            objectOutput.writeUTF( "null" );
            return;
        }

        // Write length of string
        int length = str.length();
        objectOutput.writeInt( length );

        // Write the string in portions not larger than 21845 characters.
        // The reader stops as soon as it has got 'length' characters : no
        // trailing empty portion must be written when the length is a
        // multiple of the portion size.
        int start = 0;

        do
        {
            int end = Math.min( start + UTF_CHUNK_SIZE, length );
            objectOutput.writeUTF( str.substring( start, end ) );
            start = end;
        }
        while ( start < length );
    }


    /**
     *
     * Reads in a string that has been written by {@link #writeUTF(ObjectOutput, String)}.
     *
     * First, four bytes are read (readInt) : this value is the number of characters of the string. Then the
     * string is read as one or more chunks, each of them being read using readUTF (modified UTF-8 format), until
     * the announced number of characters has been reached. At least one chunk is always read.
     *
     * A length of 0 followed by the chunk 'null' stands for a null string.
     *
     *See also {@link java.io.DataInput#readUTF()}.
     *
     * @param objectInput The objectInput to read from
     * @return The read string, which may be null
     * @throws java.io.IOException If the value can't be read
     */
    public static String readUTF( ObjectInput objectInput ) throws IOException
    {
        // Read length of the string
        int strLength = objectInput.readInt();

        // Start reading the string
        String firstChunk = objectInput.readUTF();

        if ( ( strLength == 0 ) && "null".equals( firstChunk ) )
        {
            // The special case of a 'null' string
            return null;
        }

        StringBuilder strBuf = new StringBuilder( firstChunk );

        while ( strLength > strBuf.length() )
        {
            strBuf.append( objectInput.readUTF() );
        }

        return strBuf.toString();
    }


    private Unicode()
    {
    }
}