bz2lib/blocksort.c

   1
   2 /*-------------------------------------------------------------*/
   3 /*--- Block sorting machinery                               ---*/
   4 /*---                                           blocksort.c ---*/
   5 /*-------------------------------------------------------------*/
   6
   7 /*--
   8   This file is a part of bzip2 and/or libbzip2, a program and
   9   library for lossless, block-sorting data compression.
  10
  11   Copyright (C) 1996-2000 Julian R Seward.  All rights reserved.
  12
  13   Redistribution and use in source and binary forms, with or without
  14   modification, are permitted provided that the following conditions
  15   are met:
  16
  17   1. Redistributions of source code must retain the above copyright
  18      notice, this list of conditions and the following disclaimer.
  19
  20   2. The origin of this software must not be misrepresented; you must
  21      not claim that you wrote the original software.  If you use this
  22      software in a product, an acknowledgment in the product
  23      documentation would be appreciated but is not required.
  24
  25   3. Altered source versions must be plainly marked as such, and must
  26      not be misrepresented as being the original software.
  27
  28   4. The name of the author may not be used to endorse or promote
  29      products derived from this software without specific prior written
  30      permission.
  31
  32   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
  33   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  34   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  35   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  36   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  37   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  38   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  39   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  40   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  41   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  42   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  43
  44   Julian Seward, Cambridge, UK.
  45   jseward@acm.org
  46   bzip2/libbzip2 version 1.0 of 21 March 2000
  47
  48   This program is based on (at least) the work of:
  49      Mike Burrows
  50      David Wheeler
  51      Peter Fenwick
  52      Alistair Moffat
  53      Radford Neal
  54      Ian H. Witten
  55      Robert Sedgewick
  56      Jon L. Bentley
  57
  58   For more information on these sources, see the manual.
  59
  60   To get some idea how the block sorting algorithms in this file
  61   work, read my paper
  62      On the Performance of BWT Sorting Algorithms
  63   in Proceedings of the IEEE Data Compression Conference 2000,
  64   Snowbird, Utah, USA, 27-30 March 2000.  The main sort in this
  65   file implements the algorithm called  cache  in the paper.
  66 --*/
  67
  68
  69 #include "bzlib_private.h"
  70
  71 /*---------------------------------------------*/
  72 /*--- Fallback O(N log(N)^2) sorting        ---*/
  73 /*--- algorithm, for repetitive blocks      ---*/
  74 /*---------------------------------------------*/
  75
  76 /*---------------------------------------------*/
  77 static
  78 __inline__
  79 void fallbackSimpleSort ( UInt32* fmap,
  80                           UInt32* eclass,
  81                           Int32   lo,
  82                           Int32   hi )
  83 {
  84    Int32 i, j, tmp;
  85    UInt32 ec_tmp;
  86
  87    if (lo == hi) return;
  88
  89    if (hi - lo > 3) {
  90       for ( i = hi-4; i >= lo; i-- ) {
  91          tmp = fmap[i];
  92          ec_tmp = eclass[tmp];
  93          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
  94             fmap[j-4] = fmap[j];
  95          fmap[j-4] = tmp;
  96       }
  97    }
  98
  99    for ( i = hi-1; i >= lo; i-- ) {
 100       tmp = fmap[i];
 101       ec_tmp = eclass[tmp];
 102       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
 103          fmap[j-1] = fmap[j];
 104       fmap[j-1] = tmp;
 105    }
 106 }
 107
 108
 109 /*---------------------------------------------*/
 110 #define fswap(zz1, zz2) \
 111    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
 112
 113 #define fvswap(zzp1, zzp2, zzn)       \
 114 {                                     \
 115    Int32 yyp1 = (zzp1);               \
 116    Int32 yyp2 = (zzp2);               \
 117    Int32 yyn  = (zzn);                \
 118    while (yyn > 0) {                  \
 119       fswap(fmap[yyp1], fmap[yyp2]);  \
 120       yyp1++; yyp2++; yyn--;          \
 121    }                                  \
 122 }
 123
 124
 125 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
 126
 127 #define fpush(lz,hz) { stackLo[sp] = lz; \
 128                        stackHi[sp] = hz; \
 129                        sp++; }
 130
 131 #define fpop(lz,hz) { sp--;              \
 132                       lz = stackLo[sp];  \
 133                       hz = stackHi[sp]; }
 134
 135 #define FALLBACK_QSORT_SMALL_THRESH 10
 136 #define FALLBACK_QSORT_STACK_SIZE   100
 137
 138
 139 static
 140 void fallbackQSort3 ( UInt32* fmap,
 141                       UInt32* eclass,
 142                       Int32   loSt,
 143                       Int32   hiSt )
 144 {
 145    Int32 unLo, unHi, ltLo, gtHi, n, m;
 146    Int32 sp, lo, hi;
 147    UInt32 med, r, r3;
 148    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
 149    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
 150
 151    r = 0;
 152
 153    sp = 0;
 154    fpush ( loSt, hiSt );
 155
 156    while (sp > 0) {
 157
 158       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE, 1004 );
 159
 160       fpop ( lo, hi );
 161       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
 162          fallbackSimpleSort ( fmap, eclass, lo, hi );
 163          continue;
 164       }
 165
 166       /* Random partitioning.  Median of 3 sometimes fails to
 167          avoid bad cases.  Median of 9 seems to help but
 168          looks rather expensive.  This too seems to work but
 169          is cheaper.  Guidance for the magic constants
 170          7621 and 32768 is taken from Sedgewick's algorithms
 171          book, chapter 35.
 172       */
 173       r = ((r * 7621) + 1) % 32768;
 174       r3 = r % 3;
 175       if (r3 == 0) med = eclass[fmap[lo]]; else
 176       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
 177                    med = eclass[fmap[hi]];
 178
 179       unLo = ltLo = lo;
 180       unHi = gtHi = hi;
 181
 182       while (1) {
 183          while (1) {
 184             if (unLo > unHi) break;
 185             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
 186             if (n == 0) {
 187                fswap(fmap[unLo], fmap[ltLo]);
 188                ltLo++; unLo++;
 189                continue;
 190             };
 191             if (n > 0) break;
 192             unLo++;
 193          }
 194          while (1) {
 195             if (unLo > unHi) break;
 196             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
 197             if (n == 0) {
 198                fswap(fmap[unHi], fmap[gtHi]);
 199                gtHi--; unHi--;
 200                continue;
 201             };
 202             if (n < 0) break;
 203             unHi--;
 204          }
 205          if (unLo > unHi) break;
 206          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
 207       }
 208
 209       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
 210
 211       if (gtHi < ltLo) continue;
 212
 213       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
 214       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
 215
 216       n = lo + unLo - ltLo - 1;
 217       m = hi - (gtHi - unHi) + 1;
 218
 219       if (n - lo > hi - m) {
 220          fpush ( lo, n );
 221          fpush ( m, hi );
 222       } else {
 223          fpush ( m, hi );
 224          fpush ( lo, n );
 225       }
 226    }
 227 }
 228
 229 #undef fmin
 230 #undef fpush
 231 #undef fpop
 232 #undef fswap
 233 #undef fvswap
 234 #undef FALLBACK_QSORT_SMALL_THRESH
 235 #undef FALLBACK_QSORT_STACK_SIZE
 236
 237
 238 /*---------------------------------------------*/
 239 /* Pre:
 240       nblock > 0
 241       eclass exists for [0 .. nblock-1]
 242       ((UChar*)eclass) [0 .. nblock-1] holds block
 243       ptr exists for [0 .. nblock-1]
 244
 245    Post:
 246       ((UChar*)eclass) [0 .. nblock-1] holds block
 247       All other areas of eclass destroyed
 248       fmap [0 .. nblock-1] holds sorted order
 249       bhtab [ 0 .. 2+(nblock/32) ] destroyed
 250 */
 251
 252 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
 253 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
 254 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
 255 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
 256 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
 257
 258 static
 259 void fallbackSort ( UInt32* fmap,
 260                     UInt32* eclass,
 261                     UInt32* bhtab,
 262                     Int32   nblock,
 263                     Int32   verb )
 264 {
 265    Int32 ftab[257];
 266    Int32 ftabCopy[256];
 267    Int32 H, i, j, k, l, r, cc, cc1;
 268    Int32 nNotDone;
 269    Int32 nBhtab;
 270    UChar* eclass8 = (UChar*)eclass;
 271
 272    /*--
 273       Initial 1-char radix sort to generate
 274       initial fmap and initial BH bits.
 275    --*/
 276    if (verb >= 4)
 277       VPrintf0 ( "        bucket sorting ...\n" );
 278    for (i = 0; i < 257;    i++) ftab[i] = 0;
 279    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
 280    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
 281    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
 282
 283    for (i = 0; i < nblock; i++) {
 284       j = eclass8[i];
 285       k = ftab[j] - 1;
 286       ftab[j] = k;
 287       fmap[k] = i;
 288    }
 289
 290    nBhtab = 2 + (nblock / 32);
 291    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
 292    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
 293
 294    /*--
 295       Inductively refine the buckets.  Kind-of an
 296       "exponential radix sort" (!), inspired by the
 297       Manber-Myers suffix array construction algorithm.
 298    --*/
 299
 300    /*-- set sentinel bits for block-end detection --*/
 301    for (i = 0; i < 32; i++) {
 302       SET_BH(nblock + 2*i);
 303       CLEAR_BH(nblock + 2*i + 1);
 304    }
 305
 306    /*-- the log(N) loop --*/
 307    H = 1;
 308    while (1) {
 309
 310       if (verb >= 4)
 311          VPrintf1 ( "        depth %6d has ", H );
 312
 313       j = 0;
 314       for (i = 0; i < nblock; i++) {
 315          if (ISSET_BH(i)) j = i;
 316          k = fmap[i] - H; if (k < 0) k += nblock;
 317          eclass[k] = j;
 318       }
 319
 320       nNotDone = 0;
 321       r = -1;
 322       while (1) {
 323
 324          /*-- find the next non-singleton bucket --*/
 325          k = r + 1;
 326          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 327          if (ISSET_BH(k)) {
 328             while (WORD_BH(k) == 0xffffffff) k += 32;
 329             while (ISSET_BH(k)) k++;
 330          }
 331          l = k - 1;
 332          if (l >= nblock) break;
 333          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 334          if (!ISSET_BH(k)) {
 335             while (WORD_BH(k) == 0x00000000) k += 32;
 336             while (!ISSET_BH(k)) k++;
 337          }
 338          r = k - 1;
 339          if (r >= nblock) break;
 340
 341          /*-- now [l, r] bracket current bucket --*/
 342          if (r > l) {
 343             nNotDone += (r - l + 1);
 344             fallbackQSort3 ( fmap, eclass, l, r );
 345
 346             /*-- scan bucket and generate header bits-- */
 347             cc = -1;
 348             for (i = l; i <= r; i++) {
 349                cc1 = eclass[fmap[i]];
 350                if (cc != cc1) { SET_BH(i); cc = cc1; };
 351             }
 352          }
 353       }
 354
 355       if (verb >= 4)
 356          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
 357
 358       H *= 2;
 359       if (H > nblock || nNotDone == 0) break;
 360    }
 361
 362    /*--
 363       Reconstruct the original block in
 364       eclass8 [0 .. nblock-1], since the
 365       previous phase destroyed it.
 366    --*/
 367    if (verb >= 4)
 368       VPrintf0 ( "        reconstructing block ...\n" );
 369    j = 0;
 370    for (i = 0; i < nblock; i++) {
 371       while (ftabCopy[j] == 0) j++;
 372       ftabCopy[j]--;
 373       eclass8[fmap[i]] = (UChar)j;
 374    }
 375    AssertH ( j < 256, 1005 );
 376 }
 377
 378 #undef       SET_BH
 379 #undef     CLEAR_BH
 380 #undef     ISSET_BH
 381 #undef      WORD_BH
 382 #undef UNALIGNED_BH
 383
 384
 385 /*---------------------------------------------*/
 386 /*--- The main, O(N^2 log(N)) sorting       ---*/
 387 /*--- algorithm.  Faster for "normal"       ---*/
 388 /*--- non-repetitive blocks.                ---*/
 389 /*---------------------------------------------*/
 390
 391 /*---------------------------------------------*/
 392 static
 393 __inline__
 394 Bool mainGtU ( UInt32  i1,
 395                UInt32  i2,
 396                UChar*  block,
 397                UInt16* quadrant,
 398                UInt32  nblock,
 399                Int32*  budget )
 400 {
 401    Int32  k;
 402    UChar  c1, c2;
 403    UInt16 s1, s2;
 404
 405    AssertD ( i1 != i2, "mainGtU" );
 406    /* 1 */
 407    c1 = block[i1]; c2 = block[i2];
 408    if (c1 != c2) return (c1 > c2);
 409    i1++; i2++;
 410    /* 2 */
 411    c1 = block[i1]; c2 = block[i2];
 412    if (c1 != c2) return (c1 > c2);
 413    i1++; i2++;
 414    /* 3 */
 415    c1 = block[i1]; c2 = block[i2];
 416    if (c1 != c2) return (c1 > c2);
 417    i1++; i2++;
 418    /* 4 */
 419    c1 = block[i1]; c2 = block[i2];
 420    if (c1 != c2) return (c1 > c2);
 421    i1++; i2++;
 422    /* 5 */
 423    c1 = block[i1]; c2 = block[i2];
 424    if (c1 != c2) return (c1 > c2);
 425    i1++; i2++;
 426    /* 6 */
 427    c1 = block[i1]; c2 = block[i2];
 428    if (c1 != c2) return (c1 > c2);
 429    i1++; i2++;
 430    /* 7 */
 431    c1 = block[i1]; c2 = block[i2];
 432    if (c1 != c2) return (c1 > c2);
 433    i1++; i2++;
 434    /* 8 */
 435    c1 = block[i1]; c2 = block[i2];
 436    if (c1 != c2) return (c1 > c2);
 437    i1++; i2++;
 438    /* 9 */
 439    c1 = block[i1]; c2 = block[i2];
 440    if (c1 != c2) return (c1 > c2);
 441    i1++; i2++;
 442    /* 10 */
 443    c1 = block[i1]; c2 = block[i2];
 444    if (c1 != c2) return (c1 > c2);
 445    i1++; i2++;
 446    /* 11 */
 447    c1 = block[i1]; c2 = block[i2];
 448    if (c1 != c2) return (c1 > c2);
 449    i1++; i2++;
 450    /* 12 */
 451    c1 = block[i1]; c2 = block[i2];
 452    if (c1 != c2) return (c1 > c2);
 453    i1++; i2++;
 454
 455    k = nblock + 8;
 456
 457    do {
 458       /* 1 */
 459       c1 = block[i1]; c2 = block[i2];
 460       if (c1 != c2) return (c1 > c2);
 461       s1 = quadrant[i1]; s2 = quadrant[i2];
 462       if (s1 != s2) return (s1 > s2);
 463       i1++; i2++;
 464       /* 2 */
 465       c1 = block[i1]; c2 = block[i2];
 466       if (c1 != c2) return (c1 > c2);
 467       s1 = quadrant[i1]; s2 = quadrant[i2];
 468       if (s1 != s2) return (s1 > s2);
 469       i1++; i2++;
 470       /* 3 */
 471       c1 = block[i1]; c2 = block[i2];
 472       if (c1 != c2) return (c1 > c2);
 473       s1 = quadrant[i1]; s2 = quadrant[i2];
 474       if (s1 != s2) return (s1 > s2);
 475       i1++; i2++;
 476       /* 4 */
 477       c1 = block[i1]; c2 = block[i2];
 478       if (c1 != c2) return (c1 > c2);
 479       s1 = quadrant[i1]; s2 = quadrant[i2];
 480       if (s1 != s2) return (s1 > s2);
 481       i1++; i2++;
 482       /* 5 */
 483       c1 = block[i1]; c2 = block[i2];
 484       if (c1 != c2) return (c1 > c2);
 485       s1 = quadrant[i1]; s2 = quadrant[i2];
 486       if (s1 != s2) return (s1 > s2);
 487       i1++; i2++;
 488       /* 6 */
 489       c1 = block[i1]; c2 = block[i2];
 490       if (c1 != c2) return (c1 > c2);
 491       s1 = quadrant[i1]; s2 = quadrant[i2];
 492       if (s1 != s2) return (s1 > s2);
 493       i1++; i2++;
 494       /* 7 */
 495       c1 = block[i1]; c2 = block[i2];
 496       if (c1 != c2) return (c1 > c2);
 497       s1 = quadrant[i1]; s2 = quadrant[i2];
 498       if (s1 != s2) return (s1 > s2);
 499       i1++; i2++;
 500       /* 8 */
 501       c1 = block[i1]; c2 = block[i2];
 502       if (c1 != c2) return (c1 > c2);
 503       s1 = quadrant[i1]; s2 = quadrant[i2];
 504       if (s1 != s2) return (s1 > s2);
 505       i1++; i2++;
 506
 507       if (i1 >= nblock) i1 -= nblock;
 508       if (i2 >= nblock) i2 -= nblock;
 509
 510       k -= 8;
 511       (*budget)--;
 512    }
 513       while (k >= 0);
 514
 515    return False;
 516 }
 517
 518
 519 /*---------------------------------------------*/
 520 /*--
 521    Knuth's increments seem to work better
 522    than Incerpi-Sedgewick here.  Possibly
 523    because the number of elems to sort is
 524    usually small, typically <= 20.
 525 --*/
 526 static
 527 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
 528                    9841, 29524, 88573, 265720,
 529                    797161, 2391484 };
 530
 531 static
 532 void mainSimpleSort ( UInt32* ptr,
 533                       UChar*  block,
 534                       UInt16* quadrant,
 535                       Int32   nblock,
 536                       Int32   lo,
 537                       Int32   hi,
 538                       Int32   d,
 539                       Int32*  budget )
 540 {
 541    Int32 i, j, h, bigN, hp;
 542    UInt32 v;
 543
 544    bigN = hi - lo + 1;
 545    if (bigN < 2) return;
 546
 547    hp = 0;
 548    while (incs[hp] < bigN) hp++;
 549    hp--;
 550
 551    for (; hp >= 0; hp--) {
 552       h = incs[hp];
 553
 554       i = lo + h;
 555       while (True) {
 556
 557          /*-- copy 1 --*/
 558          if (i > hi) break;
 559          v = ptr[i];
 560          j = i;
 561          while ( mainGtU (
 562                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 563                  ) ) {
 564             ptr[j] = ptr[j-h];
 565             j = j - h;
 566             if (j <= (lo + h - 1)) break;
 567          }
 568          ptr[j] = v;
 569          i++;
 570
 571          /*-- copy 2 --*/
 572          if (i > hi) break;
 573          v = ptr[i];
 574          j = i;
 575          while ( mainGtU (
 576                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 577                  ) ) {
 578             ptr[j] = ptr[j-h];
 579             j = j - h;
 580             if (j <= (lo + h - 1)) break;
 581          }
 582          ptr[j] = v;
 583          i++;
 584
 585          /*-- copy 3 --*/
 586          if (i > hi) break;
 587          v = ptr[i];
 588          j = i;
 589          while ( mainGtU (
 590                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 591                  ) ) {
 592             ptr[j] = ptr[j-h];
 593             j = j - h;
 594             if (j <= (lo + h - 1)) break;
 595          }
 596          ptr[j] = v;
 597          i++;
 598
 599          if (*budget < 0) return;
 600       }
 601    }
 602 }
 603
 604
 605 /*---------------------------------------------*/
 606 /*--
 607    The following is an implementation of
 608    an elegant 3-way quicksort for strings,
 609    described in a paper "Fast Algorithms for
 610    Sorting and Searching Strings", by Robert
 611    Sedgewick and Jon L. Bentley.
 612 --*/
 613
 614 #define mswap(zz1, zz2) \
 615    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
 616
 617 #define mvswap(zzp1, zzp2, zzn)       \
 618 {                                     \
 619    Int32 yyp1 = (zzp1);               \
 620    Int32 yyp2 = (zzp2);               \
 621    Int32 yyn  = (zzn);                \
 622    while (yyn > 0) {                  \
 623       mswap(ptr[yyp1], ptr[yyp2]);    \
 624       yyp1++; yyp2++; yyn--;          \
 625    }                                  \
 626 }
 627
 628 static
 629 __inline__
 630 UChar mmed3 ( UChar a, UChar b, UChar c )
 631 {
 632    UChar t;
 633    if (a > b) { t = a; a = b; b = t; };
 634    if (b > c) {
 635       b = c;
 636       if (a > b) b = a;
 637    }
 638    return b;
 639 }
 640
 641 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
 642
 643 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
 644                           stackHi[sp] = hz; \
 645                           stackD [sp] = dz; \
 646                           sp++; }
 647
 648 #define mpop(lz,hz,dz) { sp--;             \
 649                          lz = stackLo[sp]; \
 650                          hz = stackHi[sp]; \
 651                          dz = stackD [sp]; }
 652
 653
 654 #define mnextsize(az) (nextHi[az]-nextLo[az])
 655
 656 #define mnextswap(az,bz)                                        \
 657    { Int32 tz;                                                  \
 658      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
 659      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
 660      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
 661
 662
 663 #define MAIN_QSORT_SMALL_THRESH 20
 664 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
 665 #define MAIN_QSORT_STACK_SIZE 100
 666
 667 static
 668 void mainQSort3 ( UInt32* ptr,
 669                   UChar*  block,
 670                   UInt16* quadrant,
 671                   Int32   nblock,
 672                   Int32   loSt,
 673                   Int32   hiSt,
 674                   Int32   dSt,
 675                   Int32*  budget )
 676 {
 677    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
 678    Int32 sp, lo, hi, d;
 679
 680    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
 681    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
 682    Int32 stackD [MAIN_QSORT_STACK_SIZE];
 683
 684    Int32 nextLo[3];
 685    Int32 nextHi[3];
 686    Int32 nextD [3];
 687
 688    sp = 0;
 689    mpush ( loSt, hiSt, dSt );
 690
 691    while (sp > 0) {
 692
 693       AssertH ( sp < MAIN_QSORT_STACK_SIZE, 1001 );
 694
 695       mpop ( lo, hi, d );
 696       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
 697           d > MAIN_QSORT_DEPTH_THRESH) {
 698          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
 699          if (*budget < 0) return;
 700          continue;
 701       }
 702
 703       med = (Int32)
 704             mmed3 ( block[ptr[ lo         ]+d],
 705                     block[ptr[ hi         ]+d],
 706                     block[ptr[ (lo+hi)>>1 ]+d] );
 707
 708       unLo = ltLo = lo;
 709       unHi = gtHi = hi;
 710
 711       while (True) {
 712          while (True) {
 713             if (unLo > unHi) break;
 714             n = ((Int32)block[ptr[unLo]+d]) - med;
 715             if (n == 0) {
 716                mswap(ptr[unLo], ptr[ltLo]);
 717                ltLo++; unLo++; continue;
 718             };
 719             if (n >  0) break;
 720             unLo++;
 721          }
 722          while (True) {
 723             if (unLo > unHi) break;
 724             n = ((Int32)block[ptr[unHi]+d]) - med;
 725             if (n == 0) {
 726                mswap(ptr[unHi], ptr[gtHi]);
 727                gtHi--; unHi--; continue;
 728             };
 729             if (n <  0) break;
 730             unHi--;
 731          }
 732          if (unLo > unHi) break;
 733          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
 734       }
 735
 736       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
 737
 738       if (gtHi < ltLo) {
 739          mpush(lo, hi, d+1 );
 740          continue;
 741       }
 742
 743       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
 744       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
 745
 746       n = lo + unLo - ltLo - 1;
 747       m = hi - (gtHi - unHi) + 1;
 748
 749       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
 750       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
 751       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
 752
 753       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 754       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
 755       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 756
 757       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
 758       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
 759
 760       mpush (nextLo[0], nextHi[0], nextD[0]);
 761       mpush (nextLo[1], nextHi[1], nextD[1]);
 762       mpush (nextLo[2], nextHi[2], nextD[2]);
 763    }
 764 }
 765
 766 #undef mswap
 767 #undef mvswap
 768 #undef mpush
 769 #undef mpop
 770 #undef mmin
 771 #undef mnextsize
 772 #undef mnextswap
 773 #undef MAIN_QSORT_SMALL_THRESH
 774 #undef MAIN_QSORT_DEPTH_THRESH
 775 #undef MAIN_QSORT_STACK_SIZE
 776
 777
 778 /*---------------------------------------------*/
 779 /* Pre:
 780       nblock > N_OVERSHOOT
 781       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
 782       ((UChar*)block32) [0 .. nblock-1] holds block
 783       ptr exists for [0 .. nblock-1]
 784
 785    Post:
 786       ((UChar*)block32) [0 .. nblock-1] holds block
 787       All other areas of block32 destroyed
 788       ftab [0 .. 65536 ] destroyed
 789       ptr [0 .. nblock-1] holds sorted order
 790       if (*budget < 0), sorting was abandoned
 791 */
 792
 793 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
 794 #define SETMASK (1 << 21)
 795 #define CLEARMASK (~(SETMASK))
 796
 797 static
 798 void mainSort ( UInt32* ptr,
 799                 UChar*  block,
 800                 UInt16* quadrant,
 801                 UInt32* ftab,
 802                 Int32   nblock,
 803                 Int32   verb,
 804                 Int32*  budget )
 805 {
 806    Int32  i, j, k, ss, sb;
 807    Int32  runningOrder[256];
 808    Bool   bigDone[256];
 809    Int32  copyStart[256];
 810    Int32  copyEnd  [256];
 811    UChar  c1;
 812    Int32  numQSorted;
 813    UInt16 s;
 814    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
 815
 816    /*-- set up the 2-byte frequency table --*/
 817    for (i = 65536; i >= 0; i--) ftab[i] = 0;
 818
 819    j = block[0] << 8;
 820    i = nblock-1;
 821    for (; i >= 3; i -= 4) {
 822       quadrant[i] = 0;
 823       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 824       ftab[j]++;
 825       quadrant[i-1] = 0;
 826       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
 827       ftab[j]++;
 828       quadrant[i-2] = 0;
 829       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
 830       ftab[j]++;
 831       quadrant[i-3] = 0;
 832       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
 833       ftab[j]++;
 834    }
 835    for (; i >= 0; i--) {
 836       quadrant[i] = 0;
 837       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 838       ftab[j]++;
 839    }
 840
 841    /*-- (emphasises close relationship of block & quadrant) --*/
 842    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
 843       block   [nblock+i] = block[i];
 844       quadrant[nblock+i] = 0;
 845    }
 846
 847    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
 848
 849    /*-- Complete the initial radix sort --*/
 850    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
 851
 852    s = block[0] << 8;
 853    i = nblock-1;
 854    for (; i >= 3; i -= 4) {
 855       s = (s >> 8) | (block[i] << 8);
 856       j = ftab[s] -1;
 857       ftab[s] = j;
 858       ptr[j] = i;
 859       s = (s >> 8) | (block[i-1] << 8);
 860       j = ftab[s] -1;
 861       ftab[s] = j;
 862       ptr[j] = i-1;
 863       s = (s >> 8) | (block[i-2] << 8);
 864       j = ftab[s] -1;
 865       ftab[s] = j;
 866       ptr[j] = i-2;
 867       s = (s >> 8) | (block[i-3] << 8);
 868       j = ftab[s] -1;
 869       ftab[s] = j;
 870       ptr[j] = i-3;
 871    }
 872    for (; i >= 0; i--) {
 873       s = (s >> 8) | (block[i] << 8);
 874       j = ftab[s] -1;
 875       ftab[s] = j;
 876       ptr[j] = i;
 877    }
 878
 879    /*--
 880       Now ftab contains the first loc of every small bucket.
 881       Calculate the running order, from smallest to largest
 882       big bucket.
 883    --*/
 884    for (i = 0; i <= 255; i++) {
 885       bigDone     [i] = False;
 886       runningOrder[i] = i;
 887    }
 888
 889    {
 890       Int32 vv;
 891       Int32 h = 1;
 892       do h = 3 * h + 1; while (h <= 256);
 893       do {
 894          h = h / 3;
 895          for (i = h; i <= 255; i++) {
 896             vv = runningOrder[i];
 897             j = i;
 898             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
 899                runningOrder[j] = runningOrder[j-h];
 900                j = j - h;
 901                if (j <= (h - 1)) goto zero;
 902             }
 903             zero:
 904             runningOrder[j] = vv;
 905          }
 906       } while (h != 1);
 907    }
 908
 909    /*--
 910       The main sorting loop.
 911    --*/
 912
 913    numQSorted = 0;
 914
 915    for (i = 0; i <= 255; i++) {
 916
 917       /*--
 918          Process big buckets, starting with the least full.
 919          Basically this is a 3-step process in which we call
 920          mainQSort3 to sort the small buckets [ss, j], but
 921          also make a big effort to avoid the calls if we can.
 922       --*/
 923       ss = runningOrder[i];
 924
 925       /*--
 926          Step 1:
 927          Complete the big bucket [ss] by quicksorting
 928          any unsorted small buckets [ss, j], for j != ss.
 929          Hopefully previous pointer-scanning phases have already
 930          completed many of the small buckets [ss, j], so
 931          we don't have to sort them at all.
 932       --*/
 933       for (j = 0; j <= 255; j++) {
 934          if (j != ss) {
 935             sb = (ss << 8) + j;
 936             if ( ! (ftab[sb] & SETMASK) ) {
 937                Int32 lo = ftab[sb]   & CLEARMASK;
 938                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
 939                if (hi > lo) {
 940                   if (verb >= 4)
 941                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
 942                                 "done %d   this %d\n",
 943                                 ss, j, numQSorted, hi - lo + 1 );
 944                   mainQSort3 (
 945                      ptr, block, quadrant, nblock,
 946                      lo, hi, BZ_N_RADIX, budget
 947                   );
 948                   numQSorted += (hi - lo + 1);
 949                   if (*budget < 0) return;
 950                }
 951             }
 952             ftab[sb] |= SETMASK;
 953          }
 954       }
 955
 956       AssertH ( !bigDone[ss], 1006 );
 957
 958       /*--
 959          Step 2:
 960          Now scan this big bucket [ss] so as to synthesise the
 961          sorted order for small buckets [t, ss] for all t,
 962          including, magically, the bucket [ss,ss] too.
 963          This will avoid doing Real Work in subsequent Step 1's.
 964       --*/
 965       {
 966          for (j = 0; j <= 255; j++) {
 967             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
 968             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
 969          }
 970          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
 971             k = ptr[j]-1; if (k < 0) k += nblock;
 972             c1 = block[k];
 973             if (!bigDone[c1])
 974                ptr[ copyStart[c1]++ ] = k;
 975          }
 976          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
 977             k = ptr[j]-1; if (k < 0) k += nblock;
 978             c1 = block[k];
 979             if (!bigDone[c1])
 980                ptr[ copyEnd[c1]-- ] = k;
 981          }
 982       }
 983
 984       AssertH ( copyStart[ss]-1 == copyEnd[ss], 1007 );
 985
 986       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
 987
 988       /*--
 989          Step 3:
 990          The [ss] big bucket is now done.  Record this fact,
 991          and update the quadrant descriptors.  Remember to
 992          update quadrants in the overshoot area too, if
 993          necessary.  The "if (i < 255)" test merely skips
 994          this updating for the last bucket processed, since
 995          updating for the last bucket is pointless.
 996
 997          The quadrant array provides a way to incrementally
 998          cache sort orderings, as they appear, so as to
 999          make subsequent comparisons in fullGtU() complete
1000          faster.  For repetitive blocks this makes a big
1001          difference (but not big enough to be able to avoid
1002          the fallback sorting mechanism, exponential radix sort).
1003
1004          The precise meaning is: at all times:
1005
1006             for 0 <= i < nblock and 0 <= j <= nblock
1007
1008             if block[i] != block[j],
1009
1010                then the relative values of quadrant[i] and
1011                     quadrant[j] are meaningless.
1012
1013                else {
1014                   if quadrant[i] < quadrant[j]
1015                      then the string starting at i lexicographically
1016                      precedes the string starting at j
1017
1018                   else if quadrant[i] > quadrant[j]
1019                      then the string starting at j lexicographically
1020                      precedes the string starting at i
1021
1022                   else
1023                      the relative ordering of the strings starting
1024                      at i and j has not yet been determined.
1025                }
1026       --*/
1027       bigDone[ss] = True;
1028
1029       if (i < 255) {
1030          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
1031          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
1032          Int32 shifts   = 0;
1033
1034          while ((bbSize >> shifts) > 65534) shifts++;
1035
1036          for (j = bbSize-1; j >= 0; j--) {
1037             Int32 a2update     = ptr[bbStart + j];
1038             UInt16 qVal        = (UInt16)(j >> shifts);
1039             quadrant[a2update] = qVal;
1040             if (a2update < BZ_N_OVERSHOOT)
1041                quadrant[a2update + nblock] = qVal;
1042          }
1043          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1044       }
1045
1046    }
1047
1048    if (verb >= 4)
1049       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1050                  nblock, numQSorted, nblock - numQSorted );
1051 }
1052
1053 #undef BIGFREQ
1054 #undef SETMASK
1055 #undef CLEARMASK
1056
1057
1058 /*---------------------------------------------*/
1059 /* Pre:
1060       nblock > 0
1061       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1062       ((UChar*)arr2)  [0 .. nblock-1] holds block
1063       arr1 exists for [0 .. nblock-1]
1064
1065    Post:
1066       ((UChar*)arr2) [0 .. nblock-1] holds block
1067       All other areas of block destroyed
1068       ftab [ 0 .. 65536 ] destroyed
1069       arr1 [0 .. nblock-1] holds sorted order
1070 */
1071 void BZ2_blockSort ( EState* s )
1072 {
1073    UInt32* ptr    = s->ptr;
1074    UChar*  block  = s->block;
1075    UInt32* ftab   = s->ftab;
1076    Int32   nblock = s->nblock;
1077    Int32   verb   = s->verbosity;
1078    Int32   wfact  = s->workFactor;
1079    UInt16* quadrant;
1080    Int32   budget;
1081    Int32   budgetInit;
1082    Int32   i;
1083
1084    if (nblock < 10000) {
1085       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1086    } else {
1087       /* Calculate the location for quadrant, remembering to get
1088          the alignment right.  Assumes that &(block[0]) is at least
1089          2-byte aligned -- this should be ok since block is really
1090          the first section of arr2.
1091       */
1092       i = nblock+BZ_N_OVERSHOOT;
1093       if (i & 1) i++;
1094       quadrant = (UInt16*)(&(block[i]));
1095
1096       /* (wfact-1) / 3 puts the default-factor-30
1097          transition point at very roughly the same place as
1098          with v0.1 and v0.9.0.
1099          Not that it particularly matters any more, since the
1100          resulting compressed stream is now the same regardless
1101          of whether or not we use the main sort or fallback sort.
1102       */
1103       if (wfact < 1  ) wfact = 1;
1104       if (wfact > 100) wfact = 100;
1105       budgetInit = nblock * ((wfact-1) / 3);
1106       budget = budgetInit;
1107
1108       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1109       if (verb >= 3)
1110          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1111                     budgetInit - budget,
1112                     nblock,
1113                     (float)(budgetInit - budget) /
1114                     (float)(nblock==0 ? 1 : nblock) );
1115       if (budget < 0) {
1116          if (verb >= 2)
1117             VPrintf0 ( "    too repetitive; using fallback"
1118                        " sorting algorithm\n" );
1119          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1120       }
1121    }
1122
1123    s->origPtr = -1;
1124    for (i = 0; i < s->nblock; i++)
1125       if (ptr[i] == 0)
1126          { s->origPtr = i; break; };
1127
1128    AssertH( s->origPtr != -1, 1003 );
1129 }
1130
1131
1132 /*-------------------------------------------------------------*/
1133 /*--- end                                       blocksort.c ---*/
1134 /*-------------------------------------------------------------*/