• Main Page
  • Related Pages
  • Modules
  • Namespaces
  • Data Structures
  • Files
  • File List
  • Globals

contrib/zlib/examples/zran.c

00001 /* zran.c -- example of zlib/gzip stream indexing and random access
00002  * Copyright (C) 2005 Mark Adler
00003  * For conditions of distribution and use, see copyright notice in zlib.h
00004    Version 1.0  29 May 2005  Mark Adler */
00005 
00006 /* Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()
00007    for random access of a compressed file.  A file containing a zlib or gzip
00008    stream is provided on the command line.  The compressed stream is decoded in
00009    its entirety, and an index built with access points about every SPAN bytes
00010    in the uncompressed output.  The compressed file is left open, and can then
00011    be read randomly, having to decompress on the average SPAN/2 uncompressed
00012    bytes before getting to the desired block of data.
00013 
00014    An access point can be created at the start of any deflate block, by saving
00015    the starting file offset and bit of that block, and the 32K bytes of
00016    uncompressed data that precede that block.  Also the uncompressed offset of
00017    that block is saved to provide a reference for locating a desired starting
00018    point in the uncompressed stream.  build_index() works by decompressing the
00019    input zlib or gzip stream a block at a time, and at the end of each block
00020    deciding if enough uncompressed data has gone by to justify the creation of
00021    a new access point.  If so, that point is saved in a data structure that
00022    grows as needed to accommodate the points.
00023 
00024    To use the index, an offset in the uncompressed data is provided, for which
00025    the latest access point at or preceding that offset is located in the index.
00026    The input file is positioned to the specified location in the index, and if
00027    necessary the first few bits of the compressed data is read from the file.
00028    inflate is initialized with those bits and the 32K of uncompressed data, and
00029    the decompression then proceeds until the desired offset in the file is
00030    reached.  Then the decompression continues to read the desired uncompressed
00031    data from the file.
00032 
00033    Another approach would be to generate the index on demand.  In that case,
00034    requests for random access reads from the compressed data would try to use
00035    the index, but if a read far enough past the end of the index is required,
00036    then further index entries would be generated and added.
00037 
00038    There is some fair bit of overhead to starting inflation for the random
00039    access, mainly copying the 32K byte dictionary.  So if small pieces of the
00040    file are being accessed, it would make sense to implement a cache to hold
00041    some lookahead and avoid many calls to extract() for small lengths.
00042 
00043    Another way to build an index would be to use inflateCopy().  That would
00044    not be constrained to have access points at block boundaries, but requires
00045    more memory per access point, and also cannot be saved to file due to the
00046    use of pointers in the state.  The approach here allows for storage of the
00047    index in a file.
00048  */
00049 
00050 #include <stdio.h>
00051 #include <stdlib.h>
00052 #include <string.h>
00053 #include "zlib.h"
00054 
00055 #define local static        /* marks the file-private helper functions below */
00056 
00057 #define SPAN 1048576L       /* desired distance between access points, in bytes of uncompressed output */
00058 #define WINSIZE 32768U      /* sliding window size in bytes (32K); also the size of each saved window */
00059 #define CHUNK 16384         /* file input buffer size in bytes */
00060 
00061 /* access point entry -- the state saved at one deflate block boundary so that extract() can restart inflation there */
00062 struct point {
00063     off_t out;          /* offset in the uncompressed data at which this access point lies */
00064     off_t in;           /* offset in input file of first full byte after the access point */
00065     int bits;           /* number of bits (1-7) to take from the byte at in - 1, or 0 if none are needed */
00066     unsigned char window[WINSIZE];  /* preceding 32K of uncompressed data, handed to inflateSetDictionary() */
00067 };
00068 
00069 /* access point list -- growable array of points, appended to by addpoint() and released by free_index() */
00070 struct access {
00071     int have;           /* number of list entries filled in */
00072     int size;           /* number of list entries allocated (capacity of list) */
00073     struct point *list; /* allocated list; entries 0..have-1 are valid */
00074 };
00075 
00076 /* Deallocate an index built by build_index() */
00077 local void free_index(struct access *index)
00078 {
00079     if (index != NULL) {
00080         free(index->list);
00081         free(index);
00082     }
00083 }
00084 
00085 /* Add an entry to the access point list.  If out of memory, deallocate the
00086    existing list and return NULL. */
00087 local struct access *addpoint(struct access *index, int bits,
00088     off_t in, off_t out, unsigned left, unsigned char *window)
00089 {
00090     struct point *next;
00091 
00092     /* if list is empty, create it (start with eight points) */
00093     if (index == NULL) {
00094         index = malloc(sizeof(struct access));
00095         if (index == NULL) return NULL;
00096         index->list = malloc(sizeof(struct point) << 3);
00097         if (index->list == NULL) {
00098             free(index);
00099             return NULL;
00100         }
00101         index->size = 8;
00102         index->have = 0;
00103     }
00104 
00105     /* if list is full, make it bigger */
00106     else if (index->have == index->size) {
00107         index->size <<= 1;
00108         next = realloc(index->list, sizeof(struct point) * index->size);
00109         if (next == NULL) {
00110             free_index(index);
00111             return NULL;
00112         }
00113         index->list = next;
00114     }
00115 
00116     /* fill in entry and increment how many we have */
00117     next = index->list + index->have;
00118     next->bits = bits;
00119     next->in = in;
00120     next->out = out;
00121     if (left)
00122         memcpy(next->window, window + WINSIZE - left, left);
00123     if (left < WINSIZE)
00124         memcpy(next->window + left, window, WINSIZE - left);
00125     index->have++;
00126 
00127     /* return list, possibly reallocated */
00128     return index;
00129 }
00130 
00131 /* Make one entire pass through the compressed stream and build an index, with
00132    access points about every span bytes of uncompressed output -- span is
00133    chosen to balance the speed of random access against the memory requirements
00134    of the list, about 32K bytes per access point.  Note that data after the end
00135    of the first zlib or gzip stream in the file is ignored.  build_index()
00136    returns the number of access points on success (>= 1), Z_MEM_ERROR for out
00137    of memory, Z_DATA_ERROR for an error in the input file, or Z_ERRNO for a
00138    file read error.  On success, *built points to the resulting index. */
00139 local int build_index(FILE *in, off_t span, struct access **built)
00140 {
00141     int ret;
00142     off_t totin, totout;        /* our own total counters to avoid 4GB limit */
00143     off_t last;                 /* totout value of last access point */
00144     struct access *index;       /* access points being generated */
00145     z_stream strm;
00146     unsigned char input[CHUNK];
00147     unsigned char window[WINSIZE];
00148 
00149     /* initialize inflate */
00150     strm.zalloc = Z_NULL;
00151     strm.zfree = Z_NULL;
00152     strm.opaque = Z_NULL;
00153     strm.avail_in = 0;
00154     strm.next_in = Z_NULL;
00155     ret = inflateInit2(&strm, 47);      /* automatic zlib or gzip decoding */
00156     if (ret != Z_OK)
00157         return ret;
00158 
00159     /* inflate the input, maintain a sliding window, and build an index -- this
00160        also validates the integrity of the compressed data using the check
00161        information at the end of the gzip or zlib stream */
00162     totin = totout = last = 0;
00163     index = NULL;               /* will be allocated by first addpoint() */
00164     strm.avail_out = 0;
00165     do {
00166         /* get some compressed data from input file */
00167         strm.avail_in = fread(input, 1, CHUNK, in);
00168         if (ferror(in)) {
00169             ret = Z_ERRNO;
00170             goto build_index_error;
00171         }
00172         if (strm.avail_in == 0) {
00173             ret = Z_DATA_ERROR;
00174             goto build_index_error;
00175         }
00176         strm.next_in = input;
00177 
00178         /* process all of that, or until end of stream */
00179         do {
00180             /* reset sliding window if necessary */
00181             if (strm.avail_out == 0) {
00182                 strm.avail_out = WINSIZE;
00183                 strm.next_out = window;
00184             }
00185 
00186             /* inflate until out of input, output, or at end of block --
00187                update the total input and output counters */
00188             totin += strm.avail_in;
00189             totout += strm.avail_out;
00190             ret = inflate(&strm, Z_BLOCK);      /* return at end of block */
00191             totin -= strm.avail_in;
00192             totout -= strm.avail_out;
00193             if (ret == Z_NEED_DICT)
00194                 ret = Z_DATA_ERROR;
00195             if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
00196                 goto build_index_error;
00197             if (ret == Z_STREAM_END)
00198                 break;
00199 
00200             /* if at end of block, consider adding an index entry (note that if
00201                data_type indicates an end-of-block, then all of the
00202                uncompressed data from that block has been delivered, and none
00203                of the compressed data after that block has been consumed,
00204                except for up to seven bits) -- the totout == 0 provides an
00205                entry point after the zlib or gzip header, and assures that the
00206                index always has at least one access point; we avoid creating an
00207                access point after the last block by checking bit 6 of data_type
00208              */
00209             if ((strm.data_type & 128) && !(strm.data_type & 64) &&
00210                 (totout == 0 || totout - last > span)) {
00211                 index = addpoint(index, strm.data_type & 7, totin,
00212                                  totout, strm.avail_out, window);
00213                 if (index == NULL) {
00214                     ret = Z_MEM_ERROR;
00215                     goto build_index_error;
00216                 }
00217                 last = totout;
00218             }
00219         } while (strm.avail_in != 0);
00220     } while (ret != Z_STREAM_END);
00221 
00222     /* clean up and return index (release unused entries in list) */
00223     (void)inflateEnd(&strm);
00224     index = realloc(index, sizeof(struct point) * index->have);
00225     index->size = index->have;
00226     *built = index;
00227     return index->size;
00228 
00229     /* return error */
00230   build_index_error:
00231     (void)inflateEnd(&strm);
00232     if (index != NULL)
00233         free_index(index);
00234     return ret;
00235 }
00236 
00237 /* Use the index to read len bytes from offset into buf, return bytes read or
00238    negative for error (Z_DATA_ERROR or Z_MEM_ERROR).  If data is requested past
00239    the end of the uncompressed data, then extract() will return a value less
00240    than len, indicating how much as actually read into buf.  This function
00241    should not return a data error unless the file was modified since the index
00242    was generated.  extract() may also return Z_ERRNO if there is an error on
00243    reading or seeking the input file. */
00244 local int extract(FILE *in, struct access *index, off_t offset,
00245                   unsigned char *buf, int len)
00246 {
00247     int ret, skip;
00248     z_stream strm;
00249     struct point *here;
00250     unsigned char input[CHUNK];
00251     unsigned char discard[WINSIZE];
00252 
00253     /* proceed only if something reasonable to do */
00254     if (len < 0)
00255         return 0;
00256 
00257     /* find where in stream to start */
00258     here = index->list;
00259     ret = index->have;
00260     while (--ret && here[1].out <= offset)
00261         here++;
00262 
00263     /* initialize file and inflate state to start there */
00264     strm.zalloc = Z_NULL;
00265     strm.zfree = Z_NULL;
00266     strm.opaque = Z_NULL;
00267     strm.avail_in = 0;
00268     strm.next_in = Z_NULL;
00269     ret = inflateInit2(&strm, -15);         /* raw inflate */
00270     if (ret != Z_OK)
00271         return ret;
00272     ret = fseeko(in, here->in - (here->bits ? 1 : 0), SEEK_SET);
00273     if (ret == -1)
00274         goto extract_ret;
00275     if (here->bits) {
00276         ret = getc(in);
00277         if (ret == -1) {
00278             ret = ferror(in) ? Z_ERRNO : Z_DATA_ERROR;
00279             goto extract_ret;
00280         }
00281         (void)inflatePrime(&strm, here->bits, ret >> (8 - here->bits));
00282     }
00283     (void)inflateSetDictionary(&strm, here->window, WINSIZE);
00284 
00285     /* skip uncompressed bytes until offset reached, then satisfy request */
00286     offset -= here->out;
00287     strm.avail_in = 0;
00288     skip = 1;                               /* while skipping to offset */
00289     do {
00290         /* define where to put uncompressed data, and how much */
00291         if (offset == 0 && skip) {          /* at offset now */
00292             strm.avail_out = len;
00293             strm.next_out = buf;
00294             skip = 0;                       /* only do this once */
00295         }
00296         if (offset > WINSIZE) {             /* skip WINSIZE bytes */
00297             strm.avail_out = WINSIZE;
00298             strm.next_out = discard;
00299             offset -= WINSIZE;
00300         }
00301         else if (offset != 0) {             /* last skip */
00302             strm.avail_out = (unsigned)offset;
00303             strm.next_out = discard;
00304             offset = 0;
00305         }
00306 
00307         /* uncompress until avail_out filled, or end of stream */
00308         do {
00309             if (strm.avail_in == 0) {
00310                 strm.avail_in = fread(input, 1, CHUNK, in);
00311                 if (ferror(in)) {
00312                     ret = Z_ERRNO;
00313                     goto extract_ret;
00314                 }
00315                 if (strm.avail_in == 0) {
00316                     ret = Z_DATA_ERROR;
00317                     goto extract_ret;
00318                 }
00319                 strm.next_in = input;
00320             }
00321             ret = inflate(&strm, Z_NO_FLUSH);       /* normal inflate */
00322             if (ret == Z_NEED_DICT)
00323                 ret = Z_DATA_ERROR;
00324             if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
00325                 goto extract_ret;
00326             if (ret == Z_STREAM_END)
00327                 break;
00328         } while (strm.avail_out != 0);
00329 
00330         /* if reach end of stream, then don't keep trying to get more */
00331         if (ret == Z_STREAM_END)
00332             break;
00333 
00334         /* do until offset reached and requested data read, or stream ends */
00335     } while (skip);
00336 
00337     /* compute number of uncompressed bytes read after offset */
00338     ret = skip ? 0 : len - strm.avail_out;
00339 
00340     /* clean up and return bytes read or error */
00341   extract_ret:
00342     (void)inflateEnd(&strm);
00343     return ret;
00344 }
00345 
00346 /* Demonstrate the use of build_index() and extract() by processing the file
00347    provided on the command line, and the extracting 16K from about 2/3rds of
00348    the way through the uncompressed output, and writing that to stdout. */
00349 int main(int argc, char **argv)
00350 {
00351     int len;
00352     off_t offset;
00353     FILE *in;
00354     struct access *index = NULL;
00355     unsigned char buf[CHUNK];
00356 
00357     /* open input file */
00358     if (argc != 2) {
00359         fprintf(stderr, "usage: zran file.gz\n");
00360         return 1;
00361     }
00362     in = fopen(argv[1], "rb");
00363     if (in == NULL) {
00364         fprintf(stderr, "zran: could not open %s for reading\n", argv[1]);
00365         return 1;
00366     }
00367 
00368     /* build index */
00369     len = build_index(in, SPAN, &index);
00370     if (len < 0) {
00371         fclose(in);
00372         switch (len) {
00373         case Z_MEM_ERROR:
00374             fprintf(stderr, "zran: out of memory\n");
00375             break;
00376         case Z_DATA_ERROR:
00377             fprintf(stderr, "zran: compressed data error in %s\n", argv[1]);
00378             break;
00379         case Z_ERRNO:
00380             fprintf(stderr, "zran: read error on %s\n", argv[1]);
00381             break;
00382         default:
00383             fprintf(stderr, "zran: error %d while building index\n", len);
00384         }
00385         return 1;
00386     }
00387     fprintf(stderr, "zran: built index with %d access points\n", len);
00388 
00389     /* use index by reading some bytes from an arbitrary offset */
00390     offset = (index->list[index->have - 1].out << 1) / 3;
00391     len = extract(in, index, offset, buf, CHUNK);
00392     if (len < 0)
00393         fprintf(stderr, "zran: extraction failed: %s error\n",
00394                 len == Z_MEM_ERROR ? "out of memory" : "input corrupted");
00395     else {
00396         fwrite(buf, 1, len, stdout);
00397         fprintf(stderr, "zran: extracted %d bytes at %llu\n", len, offset);
00398     }
00399 
00400     /* clean up and exit */
00401     free_index(index);
00402     fclose(in);
00403     return 0;
00404 }

Generated on Wed Oct 20 2010 11:12:17 for APBS by  doxygen 1.7.2