LZW


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<html>
<head>
<title>LZW</title>
<meta name="description" content="LZW Demonstration Implementation in PHP" /> 
<meta name="keywords" content="lzw, php, implementation, demonstration, data compression, compression, james hamilton" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link rel="stylesheet" media="all" href="/style/style.css" type="text/css" />
</head>
<body>
<?php
?>
<?php
    /* utility function to pad out an array of strings
        so that they are all the same length.
        Adds zeros to the begining.
    */
    function pad(&$array) {
        $max = 0;

        foreach($array as $word)
            if(($len = strlen($word)) > $max) $max = $len;

        foreach($array as $key => $word)
            $array[$key] = str_pad($word, $max, "0", STR_PAD_LEFT);

        return $max;
    }


if(isset($_GET[source])) {
    highlight_file(__FILE__);
}else{
?>
<!--<b>Note: It has been pointed out that there is a problem that occurs when encoding strings of consecutive characaters e.g. 1111111111 - I will fix this when I get a chance</b><br /><br />-->
Try out <a href="http://en.wikipedia.org/wiki/LZW" target="_blank">LZW data compression</a> using my PHP implementation below - try encoding the article taken at random from Wikepdia or input your own text for encoding.
<Br /><br />
LZW doesn't work well with short strings. Try encoding 'hello'!
<br /><br />
<?php
if(!function_exists('str_split')){

    function str_split($str, $nr) {

         //Return an array with 1 less item then the one we have
         return array_slice(split("-l-", chunk_split($str, $nr, '-l-')), 0, -1);

    }
}

for($i = 0; $i < 256; $i++) //populate dictionary with ascii characters
    $startDictionary[$i] = chr($i);


function encodeLZW($string) {
    global $startDictionary;

    $dictionary = $startDictionary;

    $word = "";
    for($i = 0; $i < strlen($string); $i++) {
        
        $x = substr($string, $i, 1);
        if(in_array("$word$x", $dictionary, true)) {

            $word = "$word$x";


        }else{
            $encodedString[] = decbin(array_search("$word", $dictionary, true)); //encode to binary string


            $dictionary[] = "$word$x";

            $word = $x;

        }



    }


    $encodedString[]= decbin(array_search($word, $dictionary, true));  //encode to binary string

    return $encodedString;

}

function decodeLZW($string, $bits) {
    global $startDictionary;

    $dictionary = $startDictionary;

    $tokens = str_split($string, $bits); //tokenize the string - split every at every $bits, where $bits is the size of the bits used to encode one symbol.

    $decodedString = $dictionary[bindec($tokens[0])];       //function bindec - decode binary string to a decimal

    $word = $dictionary[bindec($tokens[0])];

    for($i = 1; $i < count($tokens); $i++) {

        $x = bindec($tokens[$i]);
        $element = $dictionary[$x];

        if(!isset($element)) {

            $element = $word . $word{0};

        }

        $decodedString .= $element;

        $dictionary[] = "$word{$element{0}}";
        $word = $element;

    }

    return $decodedString;
}




if(isset($_POST[encode])) {

    //echo stripslashes($_POST[toEncode]); //show the text being encoded
    //echo "<br /><Br />";


    $input = stripslashes($_POST[toEncode]);

    $output = encodeLZW($input);        //do the encoding


    $inputArray = str_split($input, 1);        //split the input into separate chars

    foreach($inputArray as $char) {            //work out each characters binary representation.
        $codeword = decbin(ord($char));

        $input_code .= $codeword;
    }

    //echo chunk_split($input_code, 100, "<br />");      //show the binary code for the input - 100 bits per line

    //echo "<br /><Br />";
    $before = strlen($input_code);                    //length before encoding.
    echo "input is $before bits long - encoded as 8 bit code words";

    //$codewords = explode(",",$output);


    //echo "<br /><br />";


    $bits = pad($output,0);        //pad all the code words with zeros, and return the number of bits used


    foreach($output as $codeword)    //put code together
        $output_code .= $codeword;

    $output_code = str_pad(decbin($bits), 16, "0", STR_PAD_LEFT).$output_code; //add encoded bit size to the first 16 bits of the code



    //echo chunk_split($output_code, 100, "<br />");//show the binary code for the output - 100 bits per line


    echo "<br /><br />";
    $after = strlen($output_code);    //length after encoding.
    echo "output is $after bits long - encoded as $bits bit code words";


    $_POST[toDecode] = $output_code;
    $_POST[toEncode] = "";

    $ratio = $after / $before * 100;


    echo "<br /><Br />";

    echo "compression ratio: $ratio%";

    echo "<br /><Br />";

}else if(isset($_POST[decode])){


    $input = $_POST[toDecode];

    $bits = substr($input, 0, 16);    //remove the first 16 bits
    $input = substr($input, 16);

    $output = decodeLZW($input, bindec($bits)); //decode ($bits is converted to a decimal)

    $_POST[toEncode] = $output;
    $_POST[toDecode] ="";

}






?>

<form method="post" action="<?=$_SERVER[PHP_SELF]?>">

    <textarea rows="10" cols="100" name="toEncode" onFocus="this.select();"><?=!(isset($_POST[encode]) || isset($_POST[decode])) ? "Random Article From Wikipedia: ".file_get_contents("article") : $_POST[toEncode]?></textarea>


    <input type="submit" name="encode" value="Encode" />

</form>
The encoded output will hopefully be shorter than the input!
<br /><br />
The first 16 bits of the output are used to store the number of bits used to encode each codeword.
<Br /><br />
<form method="post" action="<?=$_SERVER[PHP_SELF]?>" onfocus="this.select();">

    <textarea rows="10" cols="100" name="toDecode"><?=$_POST[toDecode]?></textarea>


    <input type="submit" name="decode" value="Decode" />

</form>

<!--<a name="fix"></a>
<div>
<div>Fixed version note</div>
You will notice <a href="lzw1.php">on this old version</a> that if you encode and decode the text it will not be the same. 

After a while of debugging I finally noticed (the algorithm looked perfect!) that the problem was caused by the <a href="http://uk.php.net/function.in_array" target="_blank">php function in_array</a> that I used to determine if word+x is in the dictionary (see LZW algorithm). 
<br /><br />
The following code demonstrates the problem:
<?php 
highlight_string('
$myarray = array(" 1", "a");
echo in_array("1", $myarray); //echo in_array("1", $myarray, true); will work correctly
');
?>
The output is 1 (true) - it seems that in_array trims the string " 1", so that it equates to "1". The fix was simple - use the strict paramater of in_array and it will also check the types. :-)

</div>
-->
<p>
<a href="<?=$_SERVER[PHP_SELF]?>?source">View Page Source</a>
</p>
<?php } ?>
<p>
</p>
<script src="http://www.google-analytics.com/urchin.js" type="text/javascript">
</script>
<script type="text/javascript">
_uacct = "UA-16562388-1";
urchinTracker();
</script>
</body>
</html>