/*-----------------------------------------------------------*/ /*--- bzip2indexer.c ---*/ /*-----------------------------------------------------------*/ /* ------------------------------------------------------------------ Based on bzip2recover.c: This file is part of bzip2/libbzip2, a program and library for lossless, block-sorting data compression. bzip2/libbzip2 version 1.0.5 of 10 December 2007 Copyright (C) 1996-2007 Julian Seward <jseward@bzip.org> Please read the WARNING, DISCLAIMER and PATENTS sections in the README file. This program is released under the terms of the license contained in the file LICENSE. ------------------------------------------------------------------ */ /* This program is a complete hack and should be rewritten properly. It isn't very complicated. */ #include <stdio.h> #include <errno.h> #include <stdlib.h> #include <string.h> #include <bzlib.h> #include "bzlib_private.h" /* This program records bit locations in the file to be recovered. That means that if 64-bit ints are not supported, we will not be able to recover .bz2 files over 512MB (2^32 bits) long. On GNU supported platforms, we take advantage of the 64-bit int support to circumvent this problem. Ditto MSVC. This change occurred in version 1.0.2; all prior versions have the 512MB limitation. */ #ifdef __GNUC__ typedef unsigned long long int MaybeUInt64; # define MaybeUInt64_FMT "%Lu" #else #ifdef _MSC_VER typedef unsigned __int64 MaybeUInt64; # define MaybeUInt64_FMT "%I64u" #else typedef unsigned int MaybeUInt64; # define MaybeUInt64_FMT "%u" #endif #endif #define True ((Bool)1) #define False ((Bool)0) #define BZ_MAX_FILENAME 2000 Char inFileName[BZ_MAX_FILENAME]; Char outFileName[BZ_MAX_FILENAME]; Char progName[BZ_MAX_FILENAME]; MaybeUInt64 bytesOut = 0; MaybeUInt64 bytesIn = 0; /*---------------------------------------------------*/ /*--- Header bytes ---*/ /*---------------------------------------------------*/ #define BZ_HDR_B 0x42 /* 'B' */ #define BZ_HDR_Z 0x5a /* 'Z' */ #define BZ_HDR_h 0x68 /* 'h' */ #define BZ_HDR_0 0x30 /* '0' */ /*---------------------------------------------------*/ /*--- I/O errors ---*/ /*---------------------------------------------------*/ /*---------------------------------------------*/ static void readError ( void ) { fprintf ( stderr, "%s: I/O error reading `%s', possible reason follows.\n", progName, inFileName ); perror ( progName ); fprintf ( stderr, "%s: warning: output file(s) may be incomplete.\n", progName ); exit ( 1 ); } /*---------------------------------------------*/ static void writeError ( void ) { fprintf ( stderr, "%s: I/O error reading `%s', possible reason follows.\n", progName, inFileName ); perror ( progName ); fprintf ( stderr, "%s: warning: output file(s) may be incomplete.\n", progName ); exit ( 1 ); } /*---------------------------------------------*/ static void mallocFail ( Int32 n ) { fprintf ( stderr, "%s: malloc failed on request for %d bytes.\n", progName, n ); fprintf ( stderr, "%s: warning: output file(s) may be incomplete.\n", progName ); exit ( 1 ); } /*---------------------------------------------*/ static void tooManyBlocks ( Int32 max_handled_blocks ) { fprintf ( stderr, "%s: `%s' appears to contain more than %d blocks\n", progName, inFileName, max_handled_blocks ); fprintf ( stderr, "%s: and cannot be handled. To fix, increase\n", progName ); fprintf ( stderr, "%s: BZ_MAX_HANDLED_BLOCKS in bzip2recover.c, and recompile.\n", progName ); exit ( 1 ); } /*---------------------------------------------------*/ /*--- Bit stream I/O ---*/ /*---------------------------------------------------*/ typedef struct { FILE* handle; Int32 buffer; Int32 buffLive; Char mode; } BitStream; /*---------------------------------------------*/ static BitStream* bsOpenReadStream ( FILE* stream ) { BitStream *bs = malloc ( sizeof(BitStream) ); if (bs == NULL) mallocFail ( sizeof(BitStream) ); bs->handle = stream; bs->buffer = 0; bs->buffLive = 0; bs->mode = 'r'; return bs; } /*---------------------------------------------*/ /*-- Returns 0 or 1, or 2 to indicate EOF. --*/ static Int32 bsGetBit ( BitStream* bs ) { if (bs->buffLive > 0) { bs->buffLive --; return ( ((bs->buffer) >> (bs->buffLive)) & 0x1 ); } else { Int32 retVal = getc ( bs->handle ); if ( retVal == EOF ) { if (errno != 0) readError(); return 2; } bs->buffLive = 7; bs->buffer = retVal; return ( ((bs->buffer) >> 7) & 0x1 ); } } /*---------------------------------------------*/ static void bsClose ( BitStream* bs ) { Int32 retVal; if ( bs->mode == 'w' ) { while ( bs->buffLive < 8 ) { bs->buffLive++; bs->buffer <<= 1; }; retVal = putc ( (UChar) (bs->buffer), bs->handle ); if (retVal == EOF) writeError(); bytesOut++; retVal = fflush ( bs->handle ); if (retVal == EOF) writeError(); } retVal = fclose ( bs->handle ); if (retVal == EOF) { if (bs->mode == 'w') writeError(); else readError(); } free ( bs ); } /*---------------------------------------------*/ // static void bsPutUChar ( BitStream* bs, UChar c ) // { // Int32 i; // for (i = 7; i >= 0; i--) // bsPutBit ( bs, (((UInt32) c) >> i) & 0x1 ); // } /*---------------------------------------------*/ // static void bsPutUInt32 ( BitStream* bs, UInt32 c ) // { // Int32 i; // // for (i = 31; i >= 0; i--) // bsPutBit ( bs, (c >> i) & 0x1 ); // } /*---------------------------------------------*/ static Bool endsInBz2 ( Char* name ) { Int32 n = strlen ( name ); if (n <= 4) return False; return (name[n-4] == '.' && name[n-3] == 'b' && name[n-2] == 'z' && name[n-1] == '2'); } /***********************************************/ struct bz_stream_seekable { unsigned char head[4]; bz_stream stream; FILE *f; }; /** Init Bz2 Stream */ int init_bs_stream(struct bz_stream_seekable *s, FILE *f) { int err; s->f = f; fseek(s->f, 0, SEEK_SET); fread(s->head, 4, 1, s->f); return 1; } /** seek to a BZ2 block */ int bs_block_seek(struct bz_stream_seekable *s, MaybeUInt64 pos) { long here = pos/8; int err; int ret = fseek(s->f, here, SEEK_SET); int remaining_bit = 0; memset(&s->stream, 0, sizeof(bz_stream)); s->stream.next_in = 0; s->stream.avail_in = 0; BZ2_bzDecompressInit(&s->stream, 0,0); s->stream.next_in = s->head; s->stream.avail_in = 4; // decompress header: err = BZ2_bzDecompress (&s->stream); if(err != BZ_OK) { fprintf(stderr, "head_err: %d\n", err); } remaining_bit = pos & 0x7; if(remaining_bit != 0) { unsigned int c = fgetc(s->f); unsigned int bit = 8 - remaining_bit; DState *ds = (DState *) s->stream.state; // printf("%u %u\n", s->bsLive, s->bsBuff); ds->bsBuff = (ds->bsBuff<<bit) | c; ds->bsLive = bit; // printf("%u bit readed\n", bit); } return ret; } /*---------------------------------------------------*/ /*--- ---*/ /*---------------------------------------------------*/ /* This logic isn't really right when it comes to Cygwin. */ #ifdef _WIN32 # define BZ_SPLIT_SYM '\\' /* path splitter on Windows platform */ #else # define BZ_SPLIT_SYM '/' /* path splitter on Unix platform */ #endif #define BLOCK_HEADER_HI 0x00003141UL #define BLOCK_HEADER_LO 0x59265359UL #define BLOCK_ENDMARK_HI 0x00001772UL #define BLOCK_ENDMARK_LO 0x45385090UL /* Increase if necessary. However, a .bz2 file with > 50000 blocks would have an uncompressed size of at least 40GB, so the chances are low you'll need to up this. */ #define BZ_MAX_HANDLED_BLOCKS 50000 MaybeUInt64 bStart [BZ_MAX_HANDLED_BLOCKS]; MaybeUInt64 bEnd [BZ_MAX_HANDLED_BLOCKS]; MaybeUInt64 rbStart[BZ_MAX_HANDLED_BLOCKS]; MaybeUInt64 rbEnd [BZ_MAX_HANDLED_BLOCKS]; void seek_and_decode_test( FILE* inFile, Int32 rbCtr) { /***** SEEKEING AND DECODING ****************/ struct bz_stream_seekable stream; int i; int err; // initialize seekable bzstream init_bs_stream(&stream, inFile); // for any block: for(i=0;i<rbCtr;++i) { char *inbuffer = malloc(65536); char *outbuffer = malloc(4*65536); // printf("Block %u: from "MaybeUInt64_FMT " to " MaybeUInt64_FMT"\n", i, rbStart[i], rbEnd[i]); // 48 BIT HEADER: bs_block_seek(&stream, rbStart[i] - (MaybeUInt64)48); stream.stream.next_out = outbuffer; stream.stream.avail_out = 4*65536; stream.stream.total_out_lo32 = 0; while(stream.stream.avail_out>0) { if(stream.stream.avail_in==0) { stream.stream.next_in = inbuffer; if((stream.stream.avail_in = fread(stream.stream.next_in, 1, 65536, inFile))==0) { fprintf(stderr, "EOF\n"); break; /* EOF */ } } err = BZ2_bzDecompress (&stream.stream); if(err != BZ_OK) { fprintf(stderr, "err: %d\n", err); } if(err==BZ_STREAM_END) { break; } } BZ2_bzDecompressEnd(&stream.stream); free(inbuffer); free(outbuffer); } } Int32 main ( Int32 argc, Char** argv ) { FILE* inFile; FILE* outFile; BitStream* bsIn; /* , *bsWr; */ Int32 b, wrBlock, currBlock, rbCtr; MaybeUInt64 bitsRead; UInt32 buffHi, buffLo, blockCRC; Char* p; strcpy ( progName, argv[0] ); inFileName[0] = outFileName[0] = 0; fprintf ( stderr, "bzip2indexer 1.0.5: extracts blocks boundaries in .bz2 files.\n" ); if (argc != 2) { fprintf ( stderr, "%s: usage is `%s damaged_file_name'.\n", progName, progName ); switch (sizeof(MaybeUInt64)) { case 8: fprintf(stderr, "\trestrictions on size of recovered file: None\n"); break; case 4: fprintf(stderr, "\trestrictions on size of recovered file: 512 MB\n"); fprintf(stderr, "\tto circumvent, recompile with MaybeUInt64 as an\n" "\tunsigned 64-bit int.\n"); break; default: fprintf(stderr, "\tsizeof(MaybeUInt64) is not 4 or 8 -- " "configuration error.\n"); break; } exit(1); } if (strlen(argv[1]) >= BZ_MAX_FILENAME-20) { fprintf ( stderr, "%s: supplied filename is suspiciously (>= %d chars) long. Bye!\n", progName, (int)strlen(argv[1]) ); exit(1); } strcpy ( inFileName, argv[1] ); inFile = fopen ( inFileName, "rb" ); if (inFile == NULL) { fprintf ( stderr, "%s: can't read `%s'\n", progName, inFileName ); exit(1); } bsIn = bsOpenReadStream ( inFile ); // fprintf ( stderr, "%s: searching for block boundaries ...\n", progName ); bitsRead = 0; buffHi = buffLo = 0; currBlock = 0; bStart[currBlock] = 0; rbCtr = 0; while (True) { // while(rbCtr<2) { // read a single bit (0,1). 2 is EOF b = bsGetBit ( bsIn ); bitsRead++; // EOF? if (b == 2) { if (bitsRead >= bStart[currBlock] && (bitsRead - bStart[currBlock]) >= 40) { bEnd[currBlock] = bitsRead-1; if (currBlock > 0) fprintf ( stderr, "Warning: block %d runs from " MaybeUInt64_FMT " to " MaybeUInt64_FMT " (incomplete)\n", currBlock, bStart[currBlock], bEnd[currBlock] ); } else currBlock--; break; } // shift buffHi,buffLo buffHi = (buffHi << 1) | (buffLo >> 31); buffLo = (buffLo << 1) | (b & 1); if ( ( (buffHi & 0x0000ffff) == BLOCK_HEADER_HI && buffLo == BLOCK_HEADER_LO) || ( (buffHi & 0x0000ffff) == BLOCK_ENDMARK_HI && buffLo == BLOCK_ENDMARK_LO) ) { if (bitsRead > 49) { bEnd[currBlock] = bitsRead-49; } else { bEnd[currBlock] = 0; } if (currBlock > 0 && (bEnd[currBlock] - bStart[currBlock]) >= 130) { fprintf ( stdout, " " MaybeUInt64_FMT " " MaybeUInt64_FMT "\n", bStart[currBlock], bEnd[currBlock]); rbStart[rbCtr] = bStart[currBlock]; rbEnd[rbCtr] = bEnd[currBlock]; rbCtr++; } if (currBlock >= BZ_MAX_HANDLED_BLOCKS) tooManyBlocks(BZ_MAX_HANDLED_BLOCKS); currBlock++; bStart[currBlock] = bitsRead; } } // bsClose ( bsIn ); // Test Seeking: seek_and_decode_test(inFile, rbCtr); return 0; } /* 00000000 42 5a 68 39 31 41 59 26 53 59 e3 e4 52 0f 01 cc |BZh91AY&SY..R...| 00000010 18 5f ff 74 10 7f ff ff ff bf ff ff fa bf ff ff |._.t............| */ /*-----------------------------------------------------------*/ /*--- end bzip2indexer.c ---*/ /*-----------------------------------------------------------*/