source code: ./othercms/ips_4.3.4/admin/convertutf8/system/Text/Module/Internal.php

Seditio Source
./othercms/ips_4.3.4/admin/convertutf8/system/Text/Module/Internal.php

<?php


/**


 * @brief        Conversion module


 * @author        <a href='https://www.invisioncommunity.com'>Invision Power Services, Inc.</a>


 * @copyright    (c) Invision Power Services, Inc.


 * @license        https://www.invisioncommunity.com/legal/standards/


 * @package        IPS Tools


 * @since        4 Sept 2013


 */





namespace IPSUtf8\Text\Module;





/**


 * Native MB Conversion class


 */


class Internal extends \IPSUtf8\Text\Charset


{


    /**


     * Converts a text string from its current charset to a destination charset using internal conversion class. 


     * The bulk of this function was written by Mikolaj Jedrzejak and used with permission via the License


     *


     * @param    string        Text string


     * @param    string        Text string char set (original)


     * @param    string        Desired character set (destination)


     * @return    @e string


     */


    public function convert( $string, $from, $to='UTF-8' )


    {


        if ( static::needsConverting( $string, $from, $to ) === false )


        {


            /* Make sure it decodes properly - some characters such as £ can cause issues between windows-1250, windows-1252, and iso-8859-1 because they store them differently. @see http://php.net/manual/en/function.utf8-decode.php */


            if ( $to == 'UTF-8' )


            {


                if ( mb_substr_count( utf8_decode( $string ), '?' ) != mb_substr_count( $string, '?' ) )


                {


                    $string = utf8_encode( $string );


                }


            }


            


            return $string;


        }


        


        $text        = '';


        $original    = $string;


        $from        = strtolower( $from );


        $to            = strtolower( $to );


        


        if( !static::$charsetPath )


        {


            static::$charsetPath    = THIS_PATH . '/system/i18n/ConvertTables/';


        }


        


        /**


         * This divison was made to prevent errors during convertion to/from utf-8 with


         * "entities" enabled, because we need to use proper destination(to)/source(from)


         * encoding table to write proper entities.


         * 


         * This is the first case. We are converting from 1byte chars...


         */


        if ($from != "utf-8") 


        { 


            /* Cache the conversion table to prevent rebuilding on every request */


            static $charsetTables = array();





            /**


             * Now build table with both charsets for encoding change. 


             */


            if( !isset( $charsetTables[ $from ] ) )


            {


                if ($to != "utf-8") 


                { 


                    $charsetTables[ $from ] = self::_makeConversionTable( $from, $to );


                }


                else


                {


                    $charsetTables[ $from ] = self::_makeConversionTable( $from );


                }


            }





            $charsetTable = $charsetTables[ $from ];


            


            if( !count($charsetTable) )


            {


                static::$errors[]    = "Character set table not found";


                return $original;


            }





            /**


             * For each char in a string... 


             */


            for ($i = 0; $i < strlen($string); $i++)


            {


                $hexChar        = "";


                $unicodeHexChar    = "";


                $hexChar        = strtoupper(dechex(ord($string[$i])));





                // This is fix from Mario Klingemann, it prevents


                // droping chars below 16 because of missing leading 0 [zeros]


                if ( strlen($hexChar)==1 )


                {


                    $hexChar = "0" . $hexChar;


                }





                // This is quick fix of 10 chars in gsm0338


                // Thanks goes to Andrea Carpani who pointed on this problem and solve it ;)


                if ( ( $from == "gsm0338" ) && ( $hexChar == '1B' ) )


                {


                    $i++;


                    $hexChar .= strtoupper(dechex(ord($string[$i])));


                }





                if ( $to != "utf-8" ) 


                {


                    if ( in_array( $hexChar, $charsetTable[$from] ) )


                    {


                        $unicodeHexChar        = array_search( $hexChar, $charsetTable[$from] );


                        $unicodeHexChars    = explode( "+",$unicodeHexChar );





                        for( $unicodeHexCharElement = 0; $unicodeHexCharElement < count($unicodeHexChars); $unicodeHexCharElement++ )


                        {


                            if ( array_key_exists( $unicodeHexChars[$unicodeHexCharElement], $charsetTable[$to] ) ) 


                            {


                                if ( self::$entities == true ) 


                                {


                                    $text .= self::_unicodeEntity( self::_hexToUtf( $unicodeHexChars[$unicodeHexCharElement] ) );


                                }


                                else


                                {


                                    $text .= chr(hexdec($charsetTable[$to][$unicodeHexChars[$unicodeHexCharElement]]));


                                }


                            }


                             else


                            {


                                static::$errors[]    = "NO_CHAR_IN_DESTINATION: {$string[$i]}";


                                return $original;


                            }


                        }


                    }


                    else


                    {


                        static::$errors[]    = "NO_CHAR_IN_SOURCE: {$string[$i]}";


                        return $original;


                    }


                }


                else


                {


                    if ( in_array( $hexChar, $charsetTable[$from] ) ) 


                    {


                        $unicodeHexChar        = array_search( $hexChar, $charsetTable[$from] );


                        $unicodeHexChars    = explode( "+", $unicodeHexChar );





                        /**


                          * Sometimes there are two or more utf-8 chars per one regular char.


                         * Extream, example is polish old Mazovia encoding, where one char contains


                         * two lettes 007a (z) and 0142 (l slash), we need to figure out how to


                         * solve this problem.


                         * The letters are merge with "plus" sign, there can be more than two chars.


                         * In Mazowia we have 007A+0142, but sometimes it can look like this


                         * 0x007A+0x0142+0x2034 (that string means nothing, it just shows the possibility...)


                          */


                        for( $unicodeHexCharElement = 0; $unicodeHexCharElement < count($unicodeHexChars); $unicodeHexCharElement++)


                        {


                            if ( self::$entities == true ) 


                            {


                                $text .= self::_unicodeEntity( self::_hexToUtf( $unicodeHexChars[$unicodeHexCharElement] ) );


                            }


                            else


                            {


                                $text .= self::_hexToUtf( $unicodeHexChars[$unicodeHexCharElement] );


                            }


                        }                            


                    }


                    else


                    {


                        static::$errors[]    = "NO_CHAR_IN_SOURCE: {$string[$i]}";


                        return $original;


                    }


                }                    


            }


        }


        


        /**


         * This is second case. We are encoding from multibyte char string. 


         */





        else if( $from == "utf-8" )


        {


            $hexChar        = "";


            $unicodeHexChar    = "";


            $charsetTable     = self::_makeConversionTable( $to );





            if( is_array($charsetTable[$to]) AND count($charsetTable[$to]) )


            {


                foreach ( $charsetTable[$to] as $unicodeHexChar => $hexChar )


                {


                    if (self::$entities == true)


                    {


                        $entityOrChar    = self::_unicodeEntity(self::_hexToUtf($unicodeHexChar));


                    }


                    else


                    {


                        $entityOrChar    = chr(hexdec($hexChar));


                    }


    


                    $string = str_replace( self::_hexToUtf( $unicodeHexChar), $entityOrChar, $string );


                }


            }





            $text = $string;


        }


    


        return $text ? $text : $original;


    }


    


    /**


     * Convert unicode characters to unicode HTML entities


     * 


     * @param    string        $unicodeString        Input Unicode string (1 char can take more than 1 byte)


     * @return    @e string


     * @see        _hexToUtf()


     */


    protected static function _unicodeEntity( $unicodeString ) 


    {


        $outString        = "";


        $stringLength    = strlen( $unicodeString );





        for( $charPosition = 0; $charPosition < $stringLength; $charPosition++ ) 


        {


            $char        = $unicodeString [$charPosition];


            $asciiChar    = ord ($char);





            if ($asciiChar < 128) //1 7 0bbbbbbb (127)


            {


               $outString .= $char; 


            }


            else if ($asciiChar >> 5 == 6) //2 11 110bbbbb 10bbbbbb (2047)


            {


               $firstByte    = ($asciiChar & 31);


               $charPosition++;


               $char        = $unicodeString [$charPosition];


               $asciiChar    = ord ($char);


               $secondByte    = ($asciiChar & 63);


               $asciiChar    = ($firstByte * 64) + $secondByte;





               $entity        = sprintf ( "&#%d;", $asciiChar );


               $outString    .= $entity;


            }


            else if ($asciiChar >> 4  == 14)  //3 16 1110bbbb 10bbbbbb 10bbbbbb


            {


                $firstByte    = ($asciiChar & 31);


                $charPosition++;


                $char        = $unicodeString [$charPosition];


                $asciiChar    = ord ($char);


                $secondByte    = ($asciiChar & 63);


                $charPosition++;


                $char        = $unicodeString [$charPosition];


                $asciiChar    = ord ($char);


                $thirdByte    = ($asciiChar & 63);


                $asciiChar    = ((($firstByte * 64) + $secondByte) * 64) + $thirdByte;


                


                $entity        = sprintf ("&#%d;", $asciiChar);


                $outString    .= $entity;


            }


            else if ($asciiChar >> 3 == 30) //4 21 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb


            {


                $firstByte    = ($asciiChar & 31);


                $charPosition++;


                $char        = $unicodeString [$charPosition];


                $asciiChar    = ord ($char);


                $secondByte    = ($asciiChar & 63);


                $charPosition++;


                $char        = $unicodeString [$charPosition];


                $asciiChar    = ord ($char);


                $thirdByte    = ($asciiChar & 63);


                $charPosition++;


                $char        = $unicodeString [$charPosition];


                $asciiChar    = ord ($char);


                $fourthByte    = ($asciiChar & 63);


                $asciiChar    = ((((($firstByte * 64) + $secondByte) * 64) + $thirdByte) * 64) + $fourthByte;


            


                $entity        = sprintf ("&#%d;", $asciiChar);


                $outString    .= $entity;


            }


          }





        return $outString;


    } 


    


    /**


     * Convert unicode characters to unicode HTML entities. 


     * It is very similar to _unicodeEntity function (link below). There is one difference 


     * in returned format. This time it's a regular char(s), in most cases it will be one or two chars. 


     * 


     * @param    string        $utfCharInHex    Hexadecimal value of a unicode char.


     * @return    @e string


     * @see        _unicodeEntity()


     */


    protected static function _hexToUtf( $utfCharInHex )


    {


        $outputChar        = "";


        $utfCharInDec    = hexdec($utfCharInHex);





        if( $utfCharInDec < 128 )


        {


            $outputChar .= chr($utfCharInDec);


        }


        else if( $utfCharInDec < 2048 )


        {


            $outputChar .= chr( ( $utfCharInDec>>6 ) + 192 ) . chr( ( $utfCharInDec&63 ) + 128 );


        }


        else if( $utfCharInDec < 65536 )


        {


            $outputChar .= chr( ( $utfCharInDec>>12 ) + 224 ) . chr( ( ( $utfCharInDec>>6 ) &63 ) + 128 ) . chr( ( $utfCharInDec&63 ) + 128 );


        }


        else if( $utfCharInDec < 2097152 )


        {


            $outputChar .= chr( $utfCharInDec>>18+240 ) . chr( ( ( $utfCharInDec>>12 ) &63 ) + 128 ) . chr( ( $utfCharInDec>>6 ) &63 + 128 ) . chr( $utfCharInDec&63 + 128 );


        }





        return $outputChar;


    }








    /**


     * Create the conversion tables to use.


     * 


     * This function creates table with two SBCS (Single Byte Character Set). Every conversion is through this table.


     *  


     * - The file with encoding tables have to be save in "Format A" of unicode.org charset table format.


     * - BOTH charsets MUST be SBCS


     * - The files with encoding tables have to be complete (None of chars can be missing, unles you are sure you are not going to use it)


     * 


     * "Format A" encoding file, if you have to build it by yourself should aplly these rules:


     * - you can comment everything with #


     * - first column contains 1 byte chars in hex starting from 0x..


     * - second column contains unicode equivalent in hex starting from 0x....


     * - then every next column is optional, but in "Format A" it should contain unicode char name or/and your own comment


     * - the columns can be splited by "spaces", "tabs", "," or any combination of these


     * - below is an example


     * 


     * <code>


     * #


     * #    The entries are in ANSI X3.4 order.


     * #


     * 0x00    0x0000    #    NULL end extra comment, if needed


     * 0x01    0x0001    #    START OF HEADING


     * # Oh, one more thing, you can make comments inside of a rows if you like.


     * 0x02    0x0002    #    START OF TEXT


     * 0x03    0x0003    #    END OF TEXT


     * next line, and so on...


     * </code>


     * 


     * You can get full tables with encodings from http://www.unicode.org


     * 


     * @param    string        $firstEncoding        Name of first encoding and first encoding filename (thay have to be the same)


     * @param    string        $secondEncoding        Name of second encoding and second encoding filename (thay have to be the same). Optional for building a joined table.


     * @return    @e array


     */


    protected static function _makeConversionTable( $firstEncoding, $secondEncoding = "" ) 


    {


        $convertTable    = array();





        for( $i = 0; $i < func_num_args(); $i++ )


        {


            /**


             * Because func_*** can't be used inside of another function call


             * we have to save it as a separate value.


             */


            $fileName = func_get_arg($i);





            if ( !is_file( static::$charsetPath . $fileName ) ) 


            {


                static::$errors[]    = "FILE_NOT_FOUND: {$fileName}";


                return;


            }





            $fileWithEncTabe    = fopen( static::$charsetPath . $fileName, "r" ) or die(); //This die(); is just to make sure...





            while(!feof($fileWithEncTabe))


            {


                /**


                 * We asume that line is not longer


                 * than 1024 which is the default value for fgets function 


                 */


                if( $oneLine = trim( fgets( $fileWithEncTabe, 1024 ) ) )


                {


                    /**


                     * We don't need all comment lines. I check only for "#" sign, because


                     * this is a way of making comments by unicode.org in thair encoding files


                     * and that's where the files are from :-)


                     */


                    if ( substr( $oneLine, 0, 1 ) != "#" ) 


                    {


                        /**


                         * Sometimes inside the charset file the hex walues are separated by


                         * "space" and sometimes by "tab", the below preg_split can also be used


                         * to split files where separator is a ",", "\r", "\n" and "\f"


                         */


                        $hexValue = preg_split ( "/[\s,]+/", $oneLine, 3 );  //We need only first 2 values





                        /**


                         * Sometimes char is UNDEFINED, or missing so we can't use it for convertion


                         */


                        if (substr($hexValue[1], 0, 1) != "#") 


                        {


                            $arrayKey    = strtoupper(str_replace(strtolower("0x"), "", $hexValue[1]));


                            $arrayValue    = strtoupper(str_replace(strtolower("0x"), "", $hexValue[0]));





                            $convertTable[ func_get_arg($i) ][ $arrayKey ] = $arrayValue;


                        }


                    }


                }


            }


        }





        return $convertTable;


    }


}