/* "LZW.java" */ import java.io.*; /** The Limpel, Ziv, Welch (LZW) algorithm compresses any sequence of byte data. * The resulting compressed codes are larger than bytes, * but can each represent a group of bytes, * at least some of the time, * for what is usually an overall saving. *
* LZW compression recognizes sequences that have already appeared in the data, * and replaces each later appearance by a unique code. * The degree of compression depends on how much repetition is found in the data. * LZW compression is effective with data having many and / or lengthy * repeat sequences, * such as text, * and graphics with areas of solid color or patterns. *
* The sequence history should allow for the expected amount of repetition, * without wasting too much space in each code on a history index. * This version is fixed at 12 bit codes, * which can refer to nearly a 2^^12 item sequence history, * less some internal overhead. * This size is considered good for short message blocks, * and is simple to implement. *
* Note! * Other versions of LZW may use other sizes of sequence history, * and other coding conventions. * They are not inter-operable with this algorithm. *
* The original papers: *
* There is no need to construct an LZW object, * the LZW.Compress () and LZW.Expand () methods are static. */ public class LZW { /** Note some Java types differ from C types: *
* byte (8), short (16), int (32) are signed, there is no unsigned. * therefore positive (unsigned) values 0-255 need short not byte, * at least while we are actively using them. * OK to --store-- them in byte with casting and masking. *
* char (16) not (8) to support Unicode, * so char is not the same as byte (8). * neither is it the same as short (16). *
* int InputStream.read ()
* Reads the next byte of data from this input stream.
* The value byte is returned as an int in the range 0 to 255.
* If no byte is available because the end of the stream has been reached,
* the value -1 is returned.
* This method blocks until input data is available,
* the end of the stream is detected,
* or an exception is thrown. */
/* SHIFTCOUNT = BITS-8 used for hashing (why this number?)
* MAXVALUE largest below 2^^BITS is reserved as a sentinel.
* TABLESIZE must be prime larger than MAXVALUE for hashing to work.
* With MAXVALUE = 4094 the next prime is 5021 */
private static final int
SHIFTCOUNT = 4,
MAXVALUE = (1 << 12) - 1,
MAXCODE = MAXVALUE - 1,
TABLESIZE = 5021;
/** Compress a sequence of raw data bytes.
* @param inRaw source of data bytes to be compressed,
* e.g. ByteArrayInputStream or BufferedInputStream.
* @param outComp packer for destination for compressed data,
* either to a stream or an array.
* @return number of raw data bytes compressed.
* @exception IOException from either underlying stream.
*/
public static int Compress (InputStream inRaw, CodeOutputPacker outComp)
throws IOException {
int inctr = 0;
/* codes 0-255 represent themselves as possible byte values */
int nextCode = 256;
LZWHistoryItem [] history = new LZWHistoryItem [TABLESIZE];
/* get the first data byte then loop for the rest of them.
* This first data byte will be saved as Prefix,
* and the second data byte as Append,
* for nextCode=256. */
/* The value byte is returned as an int in the range 0 to 255.
* It does --not-- have to be masked. */
int oneCode = inRaw.read ();
inctr++;
int oneByte;
while (-1 != (oneByte = inRaw.read ())) {
inctr++;
/* see if the table already has this prefix//char pair */
int index = FindMatch (history, oneCode, oneByte);
/* yes at this index,
* get the code for it as the --next-- prefix,
* so we can try to append the next byte */
if (history [index] != null)
oneCode = history [index].Code;
/* no,
* output the code for the last data string,
* and start trying to grow onto the new one.
*
* this table index is vacant.
* The hash table always has some extra room,
* because MAXCODE < TABLESIZE.
*
* if not already at MAXCODE,
* add prefix//char pair as nextCode number,
* otherwise we cannot save anymore. */
else {
outComp.putN (oneCode);
if (nextCode <= MAXCODE)
history [index]
= new LZWHistoryItem (nextCode++, oneCode, oneByte);
oneCode = oneByte;
}
}
/* Output last byte directly as a code < 256,
* plus an end of buffer sentinel MAXVALUE code,
* and flush in case we are on odd half. */
outComp.putN (oneCode);
outComp.putN (MAXVALUE);
outComp.flush ();
return inctr;
}
/** Compress a sequence of raw data bytes.
* @param inRaw array of data bytes to be compressed,
* @param outComp packer for destination for compressed data,
* either to a stream or an array.
* @return number of raw data bytes compressed.
* @exception IOException from underlying stream.
*/
public static int Compress (byte [] inRaw, CodeOutputPacker outComp)
throws IOException {
return Compress (inRaw, inRaw.length, outComp);
}
/** Compress a sequence of raw data bytes.
* @param inRaw array of data bytes to be compressed,
* @param validCount how much of inRaw array to be compressed,
* @param outComp packer for destination for compressed data,
* either to a stream or an array.
* @return number of raw data bytes compressed.
* @exception IOException from underlying stream.
*/
public static int Compress (byte [] inRaw, int validCount, CodeOutputPacker outComp)
throws IOException {
int inctr = 0;
/* codes 0-255 represent themselves as possible byte values */
int nextCode = 256;
LZWHistoryItem [] history = new LZWHistoryItem [TABLESIZE];
/* get the first data byte then loop for the rest of them.
* This first data byte will be saved as Prefix,
* and the second data byte as Append,
* for nextCode=256. */
/* The value byte is cosidered signed and gets extended.
* It --does-- have to be masked to be considered 0-255. */
int oneCode = inRaw [inctr++];
oneCode &= 0x00ff;
// System.out.println ("first="+(char)oneCode);
int oneByte;
while ((inctr < inRaw.length) && (inctr < validCount)) {
oneByte = (byte)inRaw [inctr++];
oneByte &= 0x00ff;
// System.out.println ("="+(char)oneByte);
/* see if the table already has this prefix//char pair */
int index = FindMatch (history, oneCode, oneByte);
/* yes at this index,
* get the code for it as the --next-- prefix */
if (history [index] != null)
oneCode = history [index].Code;
/* no,
* output the code for the last data string,
* and start trying to grow onto the new one.
*
* this table index is vacant.
* The hash table always has some extra room,
* because MAXCODE < TABLESIZE.
*
* if not already at MAXCODE,
* add prefix//char pair as nextCode number,
* otherwise we cannot save anymore. */
else {
outComp.putN (oneCode);
if (nextCode <= MAXCODE)
history [index]
= new LZWHistoryItem (nextCode++, oneCode, oneByte);
oneCode = oneByte;
}
}
/* Output last byte directly as a code < 256,
* plus an end of buffer sentinel MAXVALUE code,
* and flush in case we are on odd half. */
outComp.putN (oneCode);
outComp.putN (MAXVALUE);
outComp.flush ();
return inctr;
}
/** Try to find a match for aPrefix//aCharacter,
* return index of the match if found,
* or a vacant slot.
* Always possible to find a vacant slot,
* because MAXCODE < TABLESIZE. */
/////// aPrefix 12-bits could use short
/////// aCharacter only needs 0-255 unsigned
private static int FindMatch (LZWHistoryItem [] history, int aPrefix, int aCharacter) {
int offset;
/* Hash index (somehow) based on both prefix and new char */
int index = (aCharacter << SHIFTCOUNT) ^ aPrefix;
if (index == 0)
offset = 1;
else
offset = TABLESIZE - index;
while (true) {
/* If either a vacant slot,
* or a match (on both parts),
* return the index. */
if ((history [index] == null)
|| ((history [index].Prefix == aPrefix)
&& (history [index].Append == aCharacter)))
return index;
/* Collision, try again...
* Because TABLESIZE is prime we try everything. */
index -= offset;
if (index < 0) index += TABLESIZE;
}
}
/** Expand Compressed codes back to raw data.
* This will fail if input was not LZW compressed data.
* @param inComp unpacker from source.
* @param outExp destination for decoded bytes.
* @return count of decoded raw data bytes.
* @exception IOException from either underlying stream.
* @exception IllegalArgumentException if input was not LZW compressed data.
*/
public static int Expand (CodeInputUnpacker inComp, OutputStream outExp)
throws IOException, IllegalArgumentException {
int outctr = 0;
int nPushed;
LZWByteStack charStack = new LZWByteStack ();
/* Will fill in parallel with Compress table,
* so that each Code has same meaning in both. */
LZWHistoryItem [] history = new LZWHistoryItem [TABLESIZE];
/* codes 0-255 already represent themselves as possible byte values */
int nextCode = 256;
/* get first input code.
* Remember 12 bit codes overlap bytes,
* so every 2 calls will input 3 bytes. */
int oldCode = inComp.getN ();
/* first input code must be a direct byte value 0-255,
* (or we have the wrong input!) */
if (oldCode > 255)
throw new IllegalArgumentException ("Not LZW compressed data");
/* so move it to output buffer.
* Also save it for the table. */
int aChar = oldCode;
outExp.write (aChar);
outctr++;
/* loop while more input,
* final code will be sentinel.
* surely will fail along the way if not LZW compressed data. */
try {
int newCode;
while (MAXVALUE != (newCode = inComp.getN ())) {
/* To check for the special STRING+CHARACTER+STRING+CHARACTER+STRING case,
* if code value larger than yet seen (they increase),
* we know it is not in the table,
* so get expansion of the old code,
* and append this code to it as a character,
* by pushing it first onto the stack. */
if (newCode >= nextCode) {
charStack.push (aChar);
nPushed = 1 + DecodeString (history, charStack, oldCode);
}
/* newCode is already known in the table so expand it,
* or it is an actual byte value < 256 */
else
nPushed = DecodeString (history, charStack, newCode);
/* This top one will go into history table as the nextCode */
outExp.write (aChar = charStack.pop ());
outctr++;
/* The rest (if any more) are output in proper order */
while (1 < nPushed--) {
outExp.write (charStack.pop ());
outctr++;
}
/* Append to history table if we have room,
* otherwise a missed chance for better compression,
* but nothing will be lost,
* because Compressor made the same decision. */
if (nextCode <= MAXCODE) {
history [nextCode] = new LZWHistoryItem (nextCode, oldCode, aChar);
nextCode++;
oldCode = newCode;
}
}
}
catch (IOException eio) { throw eio; }
catch (Exception e) {
e.printStackTrace ();
throw new IllegalArgumentException ("Not LZW compressed data"); }
return outctr;
}
/** Expand Compressed codes back to raw data.
* This will fail if input was not LZW compressed data.
* @param inComp unpacker from source.
* @param outExp destination for decoded bytes.
* @return count of decoded raw data bytes.
* @exception IOException from either underlying stream.
* @exception IllegalArgumentException if input was not LZW compressed data.
*/
public static int Expand (CodeInputUnpacker inComp, byte [] outExp)
throws IOException, IllegalArgumentException {
int outctr = 0;
int nPushed;
LZWByteStack charStack = new LZWByteStack ();
/* Will fill in parallel with Compress table,
* so that each Code has same meaning in both. */
LZWHistoryItem [] history = new LZWHistoryItem [TABLESIZE];
/* codes 0-255 already represent themselves as possible byte values */
int nextCode = 256;
/* get first input code.
* Remember 12 bit codes overlap bytes,
* so every 2 calls will input 3 bytes. */
int oldCode = inComp.getN ();
/* first input code must be a direct byte value 0-255,
* (or we have the wrong input!) */
if (oldCode > 255)
throw new IllegalArgumentException ("Not LZW compressed data");
/* so move it to output buffer.
* Also save it for the table. */
int aChar = oldCode;
outExp [outctr++] = (byte)aChar;
/* loop while more input,
* final code will be sentinel.
* surely will fail along the way if not LZW compressed data. */
try {
int newCode;
while (MAXVALUE != (newCode = inComp.getN ())) {
/* To check for the special STRING+CHARACTER+STRING+CHARACTER+STRING case,
* if code value larger than yet seen (they increase),
* we know it is not in the table,
* so get expansion of the old code,
* and append this code to it as a character,
* by pushing it first onto the stack. */
if (newCode >= nextCode) {
charStack.push (aChar);
nPushed = 1 + DecodeString (history, charStack, oldCode);
}
/* newCode is already known in the table so expand it,
* or it is an actual byte value < 256 */
else
nPushed = DecodeString (history, charStack, newCode);
/* This top one will go into history table as the nextCode */
outExp [outctr++] = (byte)(aChar = charStack.pop ());
/* The rest (if any more) are output in proper order */
while (1 < nPushed--)
outExp [outctr++] = (byte)charStack.pop ();
/* Append to history table if we have room,
* otherwise a missed chance for better compression,
* but nothing will be lost,
* because Compressor made the same decision. */
if (nextCode <= MAXCODE) {
history [nextCode] = new LZWHistoryItem (nextCode, oldCode, aChar);
nextCode++;
oldCode = newCode;
}
}
}
catch (IOException eio) { throw eio; }
catch (Exception e) { throw new IllegalArgumentException ("Not LZW compressed data"); }
return outctr;
}
private static int DecodeString (LZWHistoryItem [] history, LZWByteStack cStack, int code) {
int ii = 1;
/* codes 0-255 stand for themselves as actual byte values,
* higher ones are indices of table entries.
* add their Append values to the cStack,
* running down Prefix chain until an actual byte value */
while (code > 255) {
cStack.push (history [code].Append);
code = history [code].Prefix;
ii++;
}
/* finally add actual byte value.
* We have added (ii) values in reverse order,
* so our caller will have to unstack them. */
cStack.push ((byte)code);
return ii;
}
/** For test and demonstration.
* From a built-in string and from any named files.
* Test files may be text or binary,
* but be prepared for the binary output. */
public static void main (String [] argv) {
int nComp, nExp;
try {
ByteArrayInputStream bais
= new ByteArrayInputStream
(("1 12 123\n"
+"a ab abc abcd abcde abcdef\n"
+"b bc bcd bcde bcdef\n"
+"a ab abc abcd abcde abcdef\n"
+"b bc bcd bcde bcdef\n"
+"c cd cde cdef").getBytes ());
CodeOutputPacker baos = new CodeOutputPacker (5000);
nComp = LZW.Compress (bais, baos);
nExp = LZW.Expand (new CodeInputUnpacker
(baos.toByteArray ()),
System.out);
System.out.println ("\nOK "+nComp+" -> "+nExp);
} catch (Exception ex) { ex.printStackTrace (); }
System.out.println ("\n-------");
try {
byte [] rawbytes
= ("1 12 123\n"
+"a ab abc abcd abcde abcdef\n"
+"b bc bcd bcde bcdef\n"
+"a ab abc abcd abcde abcdef\n"
+"b bc bcd bcde bcdef\n"
+"c cd cde cdef").getBytes ();
CodeOutputPacker baos = new CodeOutputPacker (5000);
nComp = LZW.Compress (rawbytes, baos);
byte [] packedbytes = baos.toByteArray ();
nExp = LZW.Expand (new CodeInputUnpacker (packedbytes),
System.out);
System.out.println ("\nOK "+nComp+" -> "+nExp);
} catch (Exception ex) { ex.printStackTrace (); }
System.out.println ("\n-------");
for (int argc = 0; argc < argv.length; argc++) {
try {
BufferedInputStream bis
= new BufferedInputStream
(new FileInputStream
(new File (argv [argc])));
CodeOutputPacker baos = new CodeOutputPacker (250000);
nComp = LZW.Compress (bis, baos);
nExp = LZW.Expand (new CodeInputUnpacker
(baos.toByteArray ()),
System.out);
System.out.println ("\nOK:"+argv [argc]+" "+nComp+" -> "+nExp);
} catch (Exception ex) { ex.printStackTrace (); }
System.out.println ("\n-------");
}
}
}
/* Table items */
class LZWHistoryItem {
int Code = -1;
int Prefix = 0;
int Append = 0;
public LZWHistoryItem () { }
// Code and Prefix 12-bits could use short
// Append only needs 0-255 unsigned
public LZWHistoryItem (int Code, int Prefix, int Append) {
//System.out.println ("new LZWHistoryItem "+Code+" "+Prefix+"="+(char)Append);
this.Code = Code;
this.Prefix = Prefix;
this.Append = Append;
}
}
/* Note that java.util.stack handles Objects not native types,
* so it is neither convenient nor efficient here. */
class LZWByteStack {
// byte is enough for 0-255 problem is getting unsigned
private int [] ints;
private int ctr = 0;
public LZWByteStack () { this (4000); }
public LZWByteStack (int size) { ints = new int [size]; }
public void push (int bv) { ints [ctr++] = bv; }
public int pop () { return ints [--ctr]; }
}
/* */
///////////////////////////////////////////////////////