/*-----------------------------------------------------------*/
/*---                                      bzip2indexer.c ---*/
/*-----------------------------------------------------------*/

/* ------------------------------------------------------------------
   Based on bzip2recover.c:
   This file is part of bzip2/libbzip2, a program and library for
   lossless, block-sorting data compression.

   bzip2/libbzip2 version 1.0.5 of 10 December 2007
   Copyright (C) 1996-2007 Julian Seward <jseward@bzip.org>

   Please read the WARNING, DISCLAIMER and PATENTS sections in the 
   README file.

   This program is released under the terms of the license contained
   in the file LICENSE.
   ------------------------------------------------------------------ */

/* This program is a complete hack and should be rewritten properly.
	 It isn't very complicated. */

#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <bzlib.h>
#include "bzlib_private.h"

/* This program records bit locations in the file to be recovered.
   That means that if 64-bit ints are not supported, we will not
   be able to recover .bz2 files over 512MB (2^32 bits) long.
   On GNU supported platforms, we take advantage of the 64-bit
   int support to circumvent this problem.  Ditto MSVC.

   This change occurred in version 1.0.2; all prior versions have
   the 512MB limitation.
*/
#ifdef __GNUC__
   typedef  unsigned long long int  MaybeUInt64;
#  define MaybeUInt64_FMT "%Lu"
#else
#ifdef _MSC_VER
   typedef  unsigned __int64  MaybeUInt64;
#  define MaybeUInt64_FMT "%I64u"
#else
   typedef  unsigned int   MaybeUInt64;
#  define MaybeUInt64_FMT "%u"
#endif
#endif

#define True    ((Bool)1)
#define False   ((Bool)0)


#define BZ_MAX_FILENAME 2000

Char inFileName[BZ_MAX_FILENAME];
Char outFileName[BZ_MAX_FILENAME];
Char progName[BZ_MAX_FILENAME];

MaybeUInt64 bytesOut = 0;
MaybeUInt64 bytesIn  = 0;


/*---------------------------------------------------*/
/*--- Header bytes                                ---*/
/*---------------------------------------------------*/

#define BZ_HDR_B 0x42                         /* 'B' */
#define BZ_HDR_Z 0x5a                         /* 'Z' */
#define BZ_HDR_h 0x68                         /* 'h' */
#define BZ_HDR_0 0x30                         /* '0' */
 

/*---------------------------------------------------*/
/*--- I/O errors                                  ---*/
/*---------------------------------------------------*/

/*---------------------------------------------*/
static void readError ( void )
{
   fprintf ( stderr,
             "%s: I/O error reading `%s', possible reason follows.\n",
            progName, inFileName );
   perror ( progName );
   fprintf ( stderr, "%s: warning: output file(s) may be incomplete.\n",
             progName );
   exit ( 1 );
}


/*---------------------------------------------*/
static void writeError ( void )
{
   fprintf ( stderr,
             "%s: I/O error reading `%s', possible reason follows.\n",
            progName, inFileName );
   perror ( progName );
   fprintf ( stderr, "%s: warning: output file(s) may be incomplete.\n",
             progName );
   exit ( 1 );
}


/*---------------------------------------------*/
static void mallocFail ( Int32 n )
{
   fprintf ( stderr,
             "%s: malloc failed on request for %d bytes.\n",
            progName, n );
   fprintf ( stderr, "%s: warning: output file(s) may be incomplete.\n",
             progName );
   exit ( 1 );
}


/*---------------------------------------------*/
static void tooManyBlocks ( Int32 max_handled_blocks )
{
   fprintf ( stderr,
             "%s: `%s' appears to contain more than %d blocks\n",
            progName, inFileName, max_handled_blocks );
   fprintf ( stderr,
             "%s: and cannot be handled.  To fix, increase\n",
             progName );
   fprintf ( stderr, 
             "%s: BZ_MAX_HANDLED_BLOCKS in bzip2recover.c, and recompile.\n",
             progName );
   exit ( 1 );
}



/*---------------------------------------------------*/
/*--- Bit stream I/O                              ---*/
/*---------------------------------------------------*/

typedef
   struct {
      FILE*  handle;
      Int32  buffer;
      Int32  buffLive;
      Char   mode;
   }
   BitStream;


/*---------------------------------------------*/
static BitStream* bsOpenReadStream ( FILE* stream )
{
   BitStream *bs = malloc ( sizeof(BitStream) );
   if (bs == NULL) mallocFail ( sizeof(BitStream) );
   bs->handle = stream;
   bs->buffer = 0;
   bs->buffLive = 0;
   bs->mode = 'r';
   return bs;
}


/*---------------------------------------------*/
/*--
   Returns 0 or 1, or 2 to indicate EOF.
--*/
static Int32 bsGetBit ( BitStream* bs )
{
   if (bs->buffLive > 0) {
      bs->buffLive --;
      return ( ((bs->buffer) >> (bs->buffLive)) & 0x1 );
   } else {
      Int32 retVal = getc ( bs->handle );
      if ( retVal == EOF ) {
         if (errno != 0) readError();
         return 2;
      }
      bs->buffLive = 7;
      bs->buffer = retVal;
      return ( ((bs->buffer) >> 7) & 0x1 );
   }
}


/*---------------------------------------------*/
static void bsClose ( BitStream* bs )
{
   Int32 retVal;

   if ( bs->mode == 'w' ) {
      while ( bs->buffLive < 8 ) {
         bs->buffLive++;
         bs->buffer <<= 1;
      };
      retVal = putc ( (UChar) (bs->buffer), bs->handle );
      if (retVal == EOF) writeError();
      bytesOut++;
      retVal = fflush ( bs->handle );
      if (retVal == EOF) writeError();
   }
   retVal = fclose ( bs->handle );
   if (retVal == EOF) {
      if (bs->mode == 'w') writeError(); else readError();
   }
   free ( bs );
}


/*---------------------------------------------*/
// static void bsPutUChar ( BitStream* bs, UChar c )
// {
//    Int32 i;
//    for (i = 7; i >= 0; i--)
//       bsPutBit ( bs, (((UInt32) c) >> i) & 0x1 );
// }


/*---------------------------------------------*/
// static void bsPutUInt32 ( BitStream* bs, UInt32 c )
// {
//    Int32 i;
// 
//    for (i = 31; i >= 0; i--)
//       bsPutBit ( bs, (c >> i) & 0x1 );
// }


/*---------------------------------------------*/
static Bool endsInBz2 ( Char* name )
{
   Int32 n = strlen ( name );
   if (n <= 4) return False;
   return
      (name[n-4] == '.' &&
       name[n-3] == 'b' &&
       name[n-2] == 'z' &&
       name[n-1] == '2');
}

/***********************************************/

struct bz_stream_seekable {
   unsigned char head[4];
   bz_stream stream;
   FILE *f;
};

/** Init Bz2 Stream */
int init_bs_stream(struct bz_stream_seekable *s, FILE *f)
{
   int err;
   s->f = f;

   fseek(s->f, 0, SEEK_SET);
   fread(s->head, 4, 1, s->f);

  return 1;
}

/** seek to a BZ2 block */
int bs_block_seek(struct bz_stream_seekable *s, MaybeUInt64 pos)
{
  long here = pos/8;
  int err;
  int ret = fseek(s->f, here, SEEK_SET);
  int remaining_bit = 0;

   memset(&s->stream, 0, sizeof(bz_stream));
   s->stream.next_in  = 0;
   s->stream.avail_in = 0;
   
   BZ2_bzDecompressInit(&s->stream, 0,0);

   s->stream.next_in  = s->head;
   s->stream.avail_in = 4;

   // decompress header:
    err = BZ2_bzDecompress (&s->stream);

    if(err != BZ_OK)
    {
    fprintf(stderr, "head_err: %d\n", err);
    }

   remaining_bit = pos & 0x7;
     
     if(remaining_bit != 0)
     {
       unsigned int c = fgetc(s->f);
       unsigned int bit = 8 - remaining_bit;
       DState *ds = (DState *) s->stream.state;
//        printf("%u %u\n", s->bsLive, s->bsBuff);
       
       ds->bsBuff = (ds->bsBuff<<bit) |  c;
       ds->bsLive = bit;
//        printf("%u bit readed\n", bit);
     }

  return ret;
}



/*---------------------------------------------------*/
/*---                                             ---*/
/*---------------------------------------------------*/

/* This logic isn't really right when it comes to Cygwin. */
#ifdef _WIN32
#  define  BZ_SPLIT_SYM  '\\'  /* path splitter on Windows platform */
#else
#  define  BZ_SPLIT_SYM  '/'   /* path splitter on Unix platform */
#endif

#define BLOCK_HEADER_HI  0x00003141UL
#define BLOCK_HEADER_LO  0x59265359UL

#define BLOCK_ENDMARK_HI 0x00001772UL
#define BLOCK_ENDMARK_LO 0x45385090UL

/* Increase if necessary.  However, a .bz2 file with > 50000 blocks
   would have an uncompressed size of at least 40GB, so the chances
   are low you'll need to up this.
*/
#define BZ_MAX_HANDLED_BLOCKS 50000

MaybeUInt64 bStart [BZ_MAX_HANDLED_BLOCKS];
MaybeUInt64 bEnd   [BZ_MAX_HANDLED_BLOCKS];
MaybeUInt64 rbStart[BZ_MAX_HANDLED_BLOCKS];
MaybeUInt64 rbEnd  [BZ_MAX_HANDLED_BLOCKS];

void seek_and_decode_test( FILE*       inFile, Int32 rbCtr)
{
   /***** SEEKEING AND DECODING ****************/
   struct bz_stream_seekable stream;
   int i;
   int err;
   // initialize seekable bzstream
   init_bs_stream(&stream, inFile);
   
   // for any block:
   for(i=0;i<rbCtr;++i)
   {
     
     char *inbuffer = malloc(65536);
     char *outbuffer = malloc(4*65536);

//      printf("Block %u: from "MaybeUInt64_FMT " to " MaybeUInt64_FMT"\n", i, rbStart[i],  rbEnd[i]);
//   48 BIT HEADER:
     bs_block_seek(&stream, rbStart[i] - (MaybeUInt64)48);

     stream.stream.next_out = outbuffer;
     stream.stream.avail_out = 4*65536;
     stream.stream.total_out_lo32 = 0;
        
      while(stream.stream.avail_out>0)
      {
      if(stream.stream.avail_in==0)
                {
                stream.stream.next_in = inbuffer;
                if((stream.stream.avail_in = fread(stream.stream.next_in, 1, 65536, inFile))==0)
		  {
		    fprintf(stderr, "EOF\n");
                        break;  /* EOF */
		  }
                }
     
      err = BZ2_bzDecompress (&stream.stream);

      if(err != BZ_OK)
	{
	fprintf(stderr, "err: %d\n", err);
	}
      
      if(err==BZ_STREAM_END)
	{
	      break;
	}
      }
      
     BZ2_bzDecompressEnd(&stream.stream);
     
     free(inbuffer);
     free(outbuffer);
     
   }
}

Int32 main ( Int32 argc, Char** argv )
{
   FILE*       inFile;
   FILE*       outFile;
   BitStream*  bsIn; /* , *bsWr; */
   Int32       b, wrBlock, currBlock, rbCtr;
   MaybeUInt64 bitsRead;
   
   UInt32      buffHi, buffLo, blockCRC;
   Char*       p;
   
   strcpy ( progName, argv[0] );
   inFileName[0] = outFileName[0] = 0;

   fprintf ( stderr, 
             "bzip2indexer 1.0.5: extracts blocks boundaries in .bz2 files.\n" );

   if (argc != 2) {
      fprintf ( stderr, "%s: usage is `%s damaged_file_name'.\n",
                        progName, progName );
      switch (sizeof(MaybeUInt64)) {
         case 8:
            fprintf(stderr, 
                    "\trestrictions on size of recovered file: None\n");
            break;
         case 4:
            fprintf(stderr, 
                    "\trestrictions on size of recovered file: 512 MB\n");
            fprintf(stderr, 
                    "\tto circumvent, recompile with MaybeUInt64 as an\n"
                    "\tunsigned 64-bit int.\n");
            break;
         default:
            fprintf(stderr, 
                    "\tsizeof(MaybeUInt64) is not 4 or 8 -- "
                    "configuration error.\n");
            break;
      }
      exit(1);
   }

   if (strlen(argv[1]) >= BZ_MAX_FILENAME-20) {
      fprintf ( stderr, 
                "%s: supplied filename is suspiciously (>= %d chars) long.  Bye!\n",
                progName, (int)strlen(argv[1]) );
      exit(1);
   }

   strcpy ( inFileName, argv[1] );

   inFile = fopen ( inFileName, "rb" );
   if (inFile == NULL) {
      fprintf ( stderr, "%s: can't read `%s'\n", progName, inFileName );
      exit(1);
   }

   bsIn = bsOpenReadStream ( inFile );
//    fprintf ( stderr, "%s: searching for block boundaries ...\n", progName );

   bitsRead = 0;
   buffHi = buffLo = 0;
   currBlock = 0;
   bStart[currBlock] = 0;

   rbCtr = 0;

   while (True) {
   // while(rbCtr<2) {
     // read a single bit (0,1). 2 is EOF
      b = bsGetBit ( bsIn );
      bitsRead++;
      // EOF?
      if (b == 2) {
         if (bitsRead >= bStart[currBlock] &&
            (bitsRead - bStart[currBlock]) >= 40) {
            bEnd[currBlock] = bitsRead-1;
            if (currBlock > 0)
               fprintf ( stderr, "Warning: block %d runs from " MaybeUInt64_FMT 
                                 " to " MaybeUInt64_FMT " (incomplete)\n",
                         currBlock,  bStart[currBlock], bEnd[currBlock] );
         } else
            currBlock--;
         break;
      }
      // shift buffHi,buffLo
      buffHi = (buffHi << 1) | (buffLo >> 31);
      buffLo = (buffLo << 1) | (b & 1);
      if ( ( (buffHi & 0x0000ffff) == BLOCK_HEADER_HI 
             && buffLo == BLOCK_HEADER_LO)
           || 
           ( (buffHi & 0x0000ffff) == BLOCK_ENDMARK_HI 
             && buffLo == BLOCK_ENDMARK_LO)
         ) {
         if (bitsRead > 49) {
            bEnd[currBlock] = bitsRead-49;
         } else {
            bEnd[currBlock] = 0;
         }
         if (currBlock > 0 &&
	     (bEnd[currBlock] - bStart[currBlock]) >= 130) {
            fprintf ( stdout, " " MaybeUInt64_FMT " " MaybeUInt64_FMT "\n",
                      bStart[currBlock], bEnd[currBlock]);
            rbStart[rbCtr] = bStart[currBlock];
            rbEnd[rbCtr] = bEnd[currBlock];
            rbCtr++;
         }
         if (currBlock >= BZ_MAX_HANDLED_BLOCKS)
            tooManyBlocks(BZ_MAX_HANDLED_BLOCKS);
         currBlock++;

         bStart[currBlock] = bitsRead;
      }
   }

//   bsClose ( bsIn );
   

 // Test Seeking: 
   seek_and_decode_test(inFile, rbCtr);
   
   return 0;
}


/*
00000000  42 5a 68 39 31 41 59 26  53 59 e3 e4 52 0f 01 cc  |BZh91AY&SY..R...|
00000010  18 5f ff 74 10 7f ff ff  ff bf ff ff fa bf ff ff  |._.t............|

*/

/*-----------------------------------------------------------*/
/*--- end                                  bzip2indexer.c ---*/
/*-----------------------------------------------------------*/