#include #include #include #include // Total length of the input file. unsigned int nsyms = 0; // Unigram and digram tables. Counts occurrences of // characters, singly and in pairs. unsigned int unitab[256], ditab[256][256]; // Table of unigram probabilities. For character c, // unigram[c] is the probability of seeing c. double unigram[256]; // Table of digram probabilities. For characters c,d, // digram[c][d] is the probability of seeing character d // immediately after seeing character c. double digram[256][256]; // Stationary distribution to be computed. double distrib[256],newdist[256]; void printsym(unsigned char c){ unsigned char cc = ((((c%128)<32)||(c==127))?' ':c); printf("%02x[%c]",c,cc); } void print_digrams() { int col = 0; printf("Nonzero digram probabilities:\n"); for(int c=0;c<256;c++){ for(int d=0;d<256;d++){ if (ditab[c][d]!=0) { printsym(c); printf("->"); printsym(d); printf(": %8.6f ", digram[c][d]); if (++col==3) { printf("\n"); col=0; } } } } printf("\n"); } void equilibrate() { // Next, iterate through the chain for one step. int iterations = 0; double maxdelta = 0.0; double epsilon = 1e-15; do { iterations++; /*printf("Distribution at start of iteration %d:\n",iterations); for(int c=0;c<256;c++){ unsigned char cc = ((((c%128)<32)||(c==127))?' ':c); if (distrib[c] != 0.0) printf("%3u[%c]:%8.6f\n", c, cc, distrib[c]); }*/ for(int c=0;c<256;c++) newdist[c] = 0.0; // Initialize new distribution. for(int c=0;c<256;c++){ for(int d=0;d<256;d++){ newdist[d] += distrib[c]*digram[c][d]; } } // Is the new dist within epsilon of the old one? maxdelta = 0.0; for(int c=0;c<256;c++){ double delta = fabs((double)newdist[c]-distrib[c]); if (delta > maxdelta) maxdelta = delta; } if (maxdelta == 0.0) goto done; if (iterations < 100 || iterations%100 == 0) printf("After iteration %d max delta is %g.\n", iterations, maxdelta); for(int c=0;c<256;c++) distrib[c] = newdist[c]; } while (maxdelta > epsilon); done: printf("Finished calculating stationary distribution in %d iterations; max delta is %g.\n",iterations, maxdelta); for(int c=0;c<256;c++) distrib[c] = newdist[c]; printf("Stationary distribution:\n"); for(int c=0;c<256;c++){ printsym(c); printf(": %8.6f ", distrib[c]); if ((c+1)%4 == 0) printf("\n"); } } static unsigned char bytebuf = 0x00; static int bitno = 7; // write most-significant bit first (big-endian) static int bits_written = 0; static int bytes_written = 0; void writebit(FILE *file, int bit) { if (bit) bytebuf |= 1< "); printsym(d); printf(": %-9u", ditab[c][d]); col++; if(col==3) { printf("\n"); col=0; } } } } printf("\n"); // Now, we need to calculate the unigram and digram probability distributions. // Here, the digram probabilities are conditional probabilities, p(d|c). // they are the probabilities that after symbol c, you will see d next. for(int c=0; c<256; c++){ unigram[c] = unitab[c]/(double)nsyms; for(int d=0; d<256; d++){ digram[c][d] = (unitab[c])?(ditab[c][d]/(double)unitab[c]):0; } } printf("Unigram probabilities:\n"); for(int c=0;c<256;c++){ unsigned char cc = ((((c%128)<32)||(c==127))?' ':c); printf("%3u[%c]:%8.6f | ", c, cc, unigram[c]); if ((c+1)%4 == 0) printf("\n"); } print_digrams(); // Compute the stationary distribution over the symbols according to the Markov chain. // First, initialize the dist to the unigram probs. for(int c=0;c<256;c++) distrib[c] = unigram[c]; equilibrate(); // Calculate entropy of stationary distribution double entropy = 0.0; for(int c=0;c<256;c++){ unsigned char cc = ((((c%128)<32)||(c==127))?' ':c); if (distrib[c]!=0.0) printf("%3u[%c]: p=%f, i=%f, K=%f, s=%f\n", c, cc, distrib[c], 1/distrib[c], log((double)1/distrib[c]), distrib[c]*log((double)1/distrib[c])); if (distrib[c]!=0.0) entropy -= distrib[c]*log((double)distrib[c]); } entropy /= log((double)2); printf("Entropy of stationary distribution is %f bits.\n", entropy); printf("Predicted compression factor is %f times smaller\n(%f%% as big), or %d --> %d bytes.\n", 8/entropy, 100*entropy/8, nsyms, (int)(nsyms*entropy/8)); // Open & write auxilliary output file (this will contain the digram probs) char *auxname = (char *)malloc(strlen(filename)+strlen(".aux")+1); sprintf(auxname,"%s.aux",filename); printf("Writing to output file %s...\n", auxname); FILE *auxfile = fopen(auxname,"w"); fwrite(digram,sizeof(double),256*256,auxfile); fclose(auxfile); // Second pass: Compress the file with the roulette algorithm printf("Reopening %s...\n", filename); infile = fopen(filename,"r"); // Open output file char *outname = (char *)malloc(strlen(filename)+strlen(".rou")+1); sprintf(outname,"%s.rou",filename); printf("Writing to output file %s...\n", outname); FILE *outfile = fopen(outname,"w"); // Begin roulette wheel algorithm. double lo=0.0, hi=1.0; // Initialize roulette wheel range to encompass entire code space (all possible sequences) // Handle first input symbol. Assume it's chosen randomly from the stationary distribution. sym = fgetc(infile); oldsym = 0; printf("First symbol is "); printsym(sym); printf("\n"); // Bump up low end of range to skip regions for earlier symbols... for(int c=0;c= .5) { if (hi < .5) { printf("\tWriting a 0.\n"); writebit(outfile, 0); lo *= 2; hi *= 2; // Map [0,.5] -> [0,1] } else { printf("\tWriting a 1.\n"); writebit(outfile, 1); lo = lo*2-1; hi = hi*2-1; // Map [.5,1] -> [0,1] } printf("\t\tRange expanded to [%f,%f].\n", lo, hi); } // Loop through rest of input file. while (!feof(infile)) { oldsym = sym; sym = fgetc(infile); printf("Next symbol is "); printsym(sym); printf("\n"); // What part of present range is the new symbol in? double partlo = 0.0, parthi; for(int c=0; c= .5) { if (hi < .5) { printf("\tWriting a 0.\n"); writebit(outfile, 0); lo *= 2; hi *= 2; // Map [0,.5] -> [0,1] } else { printf("\tWriting a 1.\n"); writebit(outfile, 1); lo = lo*2-1; hi = hi*2-1; // Map [.5,1] -> [0,1] } printf("\t\tRange expanded [%f,%f].\n", lo, hi); } } flushbits(outfile); // Finish writing final byte printf("A total of %d bits (%d bytes) were written to the output file.\n", bits_written, bytes_written); fclose(outfile); fclose(infile); // Now comes the hard(er) part... Decompression. First, we have // to read in the digram table. Then we have to reconstruct the // equilibrium distribution from it. (This is only needed for // the first letter!) Then start reading bits and narrowing // down where we are in the equil. distr. Then once we've // reconstructed the first symbol, we switch over to using the // distributions from the digram table. The only question is, // does roundoff error corrupt the results at some point? // We shall see... // Clear the digram table, to make it clear we're not cheating. for(int c=0;c<256;c++) for(int d=0;d<256;d++) digram[c][d] = 0.0; // Read the digram table from the 256 KB aux file. Clearly, // in practice this algorithm would only be efficient for // compressing data streams that are much longer than 256 KB. // We could do better by saving just the nonzero entries. // Open & read auxilliary output file (this will contain the digram probs) printf("Reading digram table from auxilliary file %s...\n",auxname); auxfile = fopen(auxname,"r"); fread(digram,sizeof(double),256*256,auxfile); fclose(auxfile); // Print the digram table so we can see if it matches the original. print_digrams(); // Clear the stationary distribution table, to make it clear we're not cheating here either. for(int c=0;c<256;c++) distrib[c] = 0.0; // First, search the digram table for a nonzero entry, any entry! unsigned char anysym; for(int c=0;c<256;c++) { for(int d=0;d<256;d++){ if (digram[c][d] != 0.0) { anysym = d; goto foundit; } } } foundit: // Now, put a "1.0" probability in that symbol in the distrib... distrib[anysym] = 1.0; // Now, begin smearing out the probability mass, same algorithm as before... // It should still work, but will just take a bit longer than when we started // from the unigram distribution... equilibrate(); // Now initialize the range for roulette algorithm. lo=0.0; hi=1.0; // Initialize roulette wheel range to encompass entire code space (all possible sequences) // Now open and begin reading the output file. printf("Reading roulette-compressed data from file %s...\n",outname); outfile = fopen(outname,"r"); // Open and begin writing the reconstructed file. char *recname = (char *)malloc(strlen(filename)+strlen(".rec")+1); sprintf(recname,"%s.rec",filename); printf("Writing to reconstructed input file %s...\n", recname); FILE *recfile = fopen(recname,"w"); int bit; readloop: bit = readbit(outfile); if (bit == 0) { hi = lo + (hi-lo)*.5; } else { lo = hi - (hi-lo)*.5; } printf("Saw a %d bit, range is now [%f,%f].\n", bit, lo, hi); // Now, we have to sift through the equilibrium distribution to see if this // puts us inside any of the symbols yet. double cumprob = 0.0; // cumulative probability up to the current point for(int sym=0; sym<256; sym++) { if (lo >= cumprob && hi < cumprob + distrib[sym]) { // We have a winnah! Output this symbol and continue printf("The initial symbol is "); printsym(sym); printf(". Hooray!\n"); fputc(sym,recfile); // Stretch the current range in accordance with mapping the symbol's range out to [0,1] lo -= cumprob; hi -= cumprob; lo /= distrib[sym]; hi /= distrib[sym]; printf("\tStretched the range out to [%f,%f].\n", lo, hi); oldsym = sym; goto do_rest; // The other symbols will use the digram probs } // Accumulate the probability of the symbol we just considered... cumprob += distrib[sym]; if (cumprob > hi) break; //short-circuit exit } // OK, we don't have a complete symbol yet, have to keep reading bits. goto readloop; do_rest: if (feof(outfile)) goto done; // No more bits to read? bit = readbit(outfile); if (bit == 0) { hi = lo + (hi-lo)*.5; } else { lo = hi - (hi-lo)*.5; } printf("Saw a %d bit, range is now [%f,%f].\n", bit, lo, hi); // Now, we have to sift through the equilibrium distribution to see if this // puts us inside any of the symbols yet. cumprob = 0.0; // cumulative probability up to the current point for(int sym=0; sym<256; sym++) { if (lo >= cumprob && hi < cumprob + digram[oldsym][sym]) { // We have a winnah! Output this symbol and continue printf("The next symbol is "); printsym(sym); printf(". Whoopee!\n"); fputc(sym,recfile); // Stretch the current range in accordance with mapping the symbol's range out to [0,1] lo -= cumprob; hi -= cumprob; lo /= digram[oldsym][sym]; hi /= digram[oldsym][sym]; printf("\tStretched the range out to [%f,%f].\n", lo, hi); oldsym = sym; goto do_rest; } // Accumulate the probability of the symbol we just considered... cumprob += digram[oldsym][sym]; if (cumprob > hi) break; //short-circuit exit } // OK, we don't have a complete symbol yet, have to keep reading bits. goto do_rest; done: // FINISH DECODING HERE fclose(outfile); fclose(recfile); free(outname); free(auxname); free(recname); // strings malloc'ed earlier exit(0); }