omega-rpg/tools/bwt.c

209 lines
5.7 KiB
C

/*
* bwt.c
*
* Code modified from Mark Nelson's article:
* "Data Compression With The Burrows-Wheeler Transform"
* Dr. Dobbs Journal, September 1996.
* See: http://www.dogma.net/markn/articles/bwt/bwt.htm to make sense
* of this source code.
*/
/*
* It is because of limitations of the standard C qsort function that
* we have global data.
* we want to sort indices into the temporaryBuffer, but qsort doesn't provide
* any mechanism of forwarding arbitrary out of band data to the called
* function.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
unsigned char *BWT_temporaryBuffer;
unsigned int BWT_length;
/* The following function compares two substrings of length "length"
* and returns a value less than 0 if the index1 string would be sorted
* first, 0 if they are the same, and a positive number if the index2
* string should be sorted first.
*
* Standard memcmp has an issue that some implementations treat
* characters as signed, and others don't. Since this is a string
* compression algorithm, that should not make a difference to
* compressibility, but it will make the output different on two
* different platforms.
* We are only using this for ASCII data with OMEGA, so this is not
* an issue at present.
*
*/
int BWT_Compare( const void *pointer1, const void *pointer2 )
{
unsigned int index1=*( unsigned int * )pointer1;
unsigned int index2=*( unsigned int * )pointer2;
return memcmp(BWT_temporaryBuffer+index1, BWT_temporaryBuffer+index2, BWT_length);
};
/* The following function does the actual transform. It does this
* transform "in place" in the buffer, and returns an unsigned int
* in the range 0 <= X < length as out of band data for the transform.
* This value needs to be passed to the "detransform".
*
* Large "length" values passed in may take awhile do to the (O)N lg N
* time of sorting (On most platforms).
*/
unsigned int BWT_Transform(unsigned char *buffer, unsigned int length)
{
unsigned int i;
unsigned int first = 0;
unsigned int index;
unsigned int *indices;
BWT_length=length;
BWT_temporaryBuffer=malloc(2*length*sizeof(unsigned char));
if(BWT_temporaryBuffer==NULL)
{
printf( "Failure allocating memory in BWT_transform.\n" );
exit( EXIT_FAILURE );
}
indices=malloc(length*sizeof(unsigned int));
if(indices==NULL)
{
free( BWT_temporaryBuffer );
printf( "Failure allocating memory in BWT_transform.\n" );
exit( EXIT_FAILURE );
}
memcpy(BWT_temporaryBuffer, buffer, length);
memcpy(BWT_temporaryBuffer+length, buffer, length);
for( i=0; i < length; i++)
{
indices[ i ] = i;
}
qsort( indices, length, sizeof( unsigned int ), BWT_Compare );
for( i = 0; i < length; i++ )
{
if( indices[ i ] == 0 )
{
index = length-1;
}
else
{
index = indices[ i ]-1;
if( index == 0 )
{
first=i;
}
}
buffer[i] = BWT_temporaryBuffer[ index ];
}
free( BWT_temporaryBuffer );
free( indices );
return first;
}
void BWT_Detransform( unsigned char *buffer, unsigned int length, unsigned int first )
{
unsigned int i,index;
unsigned char *temporaryBuffer;
unsigned int *characterCountArray, *runningCountArray;
unsigned int *transpositionBuffer;
unsigned int sum=0;
unsigned int previous;
unsigned char character;
runningCountArray = malloc( 256*sizeof( unsigned int ) ); /* Too big to put on stack */
if( runningCountArray == NULL )
{
printf( "Failure allocating memory in BWT_Detransform.\n" );
exit( EXIT_FAILURE );
}
characterCountArray = malloc( 256*sizeof( unsigned int ) ); /* Too big to put on stack */
if( characterCountArray == NULL )
{
free( runningCountArray );
printf( "Failure allocating memory in BWT_Detransform.\n" ); /* Too big to put on stack */
exit( EXIT_FAILURE );
}
temporaryBuffer=malloc( length*sizeof( unsigned char ) );
if( temporaryBuffer == NULL )
{
free( runningCountArray );
free( characterCountArray );
printf( "Failure allocating memory in BWT_transform.\n" ); /* Too big to put on stack */
exit( EXIT_FAILURE );
}
memcpy( temporaryBuffer, buffer, length );
transpositionBuffer = malloc(length*sizeof(unsigned int));
if( transpositionBuffer == NULL )
{
free( runningCountArray );
free( characterCountArray );
free( temporaryBuffer );
}
/* Calculate the running totals for characters fo the alphabet. */
/* First initialize arrays */
for( i = 0 ; i < 256 ; i++ )
{
runningCountArray[ i ] = 0;
characterCountArray[ i ] = 0;
}
for( i = 0; i < length; i++ )
{
runningCountArray[ buffer[ i ] ]++;
}
for( i = 0; i < 256; i++ )
{
previous=runningCountArray[ i ];
runningCountArray[ i ]=sum;
sum = sum + previous;
}
/* Okay, now, for each character we have a map of how many characters are in
* the file that are smaller than it.
*/
/* Given this information (which represents the "sorted" block) */
/* And the information of how many times the character was */
/* found previously in the "buffer" added together we come up */
/* with a unique position on the transposition table */
for ( i = 0 ; i < length ; i++ )
{
character = buffer[ i ];
transpositionBuffer[ characterCountArray[ character ] + runningCountArray[ character ] ] = i;
characterCountArray[ character ]++;
}
/* Transpose by following indices. */
index = first;
for ( i = 0 ; i < length ; i++ )
{
buffer[ i ]=temporaryBuffer[ index ];
index = transpositionBuffer[ index ];
}
free( runningCountArray );
free( characterCountArray );
free( temporaryBuffer );
free( transpositionBuffer );
}