root/maint/gnulib/lib/diffseq.h

/* [previous][next][first][last][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes following definitions.
  1. diag
  2. compareseq

   1 /* Analyze differences between two vectors.
   2 
   3    Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2021 Free Software
   4    Foundation, Inc.
   5 
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10 
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15 
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  18 
  19 
  20 /* The basic idea is to consider two vectors as similar if, when
  21    transforming the first vector into the second vector through a
  22    sequence of edits (inserts and deletes of one element each),
  23    this sequence is short - or equivalently, if the ordered list
  24    of elements that are untouched by these edits is long.  For a
  25    good introduction to the subject, read about the "Levenshtein
  26    distance" in Wikipedia.
  27 
  28    The basic algorithm is described in:
  29    "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
  30    Algorithmica Vol. 1, 1986, pp. 251-266,
  31    <https://doi.org/10.1007/BF01840446>.
  32    See especially section 4.2, which describes the variation used below.
  33 
  34    The basic algorithm was independently discovered as described in:
  35    "Algorithms for Approximate String Matching", Esko Ukkonen,
  36    Information and Control Vol. 64, 1985, pp. 100-118,
  37    <https://doi.org/10.1016/S0019-9958(85)80046-2>.
  38 
  39    Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
  40    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
  41    at the price of producing suboptimal output for large inputs with
  42    many differences.  */
  43 
  44 /* Before including this file, you need to define:
  45      ELEMENT                 The element type of the vectors being compared.
  46      EQUAL                   A two-argument macro that tests two elements for
  47                              equality.
  48      OFFSET                  A signed integer type sufficient to hold the
  49                              difference between two indices.  Usually
  50                              something like ptrdiff_t.
  51      EXTRA_CONTEXT_FIELDS    Declarations of fields for 'struct context'.
  52      NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
  53      NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
  54      NOTE_ORDERED            (Optional) A boolean expression saying that
  55                              NOTE_DELETE and NOTE_INSERT calls must be
  56                              issued in offset order.
  57      EARLY_ABORT(ctxt)       (Optional) A boolean expression that triggers an
  58                              early abort of the computation.
  59      USE_HEURISTIC           (Optional) Define if you want to support the
  60                              heuristic for large vectors.
  61 
  62    It is also possible to use this file with abstract arrays.  In this case,
  63    xvec and yvec are not represented in memory.  They only exist conceptually.
  64    In this case, the list of defines above is amended as follows:
  65      ELEMENT                 Undefined.
  66      EQUAL                   Undefined.
  67      XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
  68                              A three-argument macro: References xvec[xoff] and
  69                              yvec[yoff] and tests these elements for equality.
  70 
  71    Before including this file, you also need to include:
  72      #include <limits.h>
  73      #include <stdbool.h>
  74      #include "minmax.h"
  75  */
  76 
  77 /* Maximum value of type OFFSET.  */
  78 #define OFFSET_MAX \
  79   ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
  80 
  81 /* Default to no early abort.  */
  82 #ifndef EARLY_ABORT
  83 # define EARLY_ABORT(ctxt) false
  84 #endif
  85 
  86 #ifndef NOTE_ORDERED
  87 # define NOTE_ORDERED false
  88 #endif
  89 
  90 /* Use this to suppress gcc's "...may be used before initialized" warnings.
  91    Beware: The Code argument must not contain commas.  */
  92 #ifndef IF_LINT
  93 # if defined GCC_LINT || defined lint
  94 #  define IF_LINT(Code) Code
  95 # else
  96 #  define IF_LINT(Code) /* empty */
  97 # endif
  98 #endif
  99 
 100 /*
 101  * Context of comparison operation.
 102  */
 103 struct context
 104 {
 105   #ifdef ELEMENT
 106   /* Vectors being compared.  */
 107   ELEMENT const *xvec;
 108   ELEMENT const *yvec;
 109   #endif
 110 
 111   /* Extra fields.  */
 112   EXTRA_CONTEXT_FIELDS
 113 
 114   /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
 115      furthest along the given diagonal in the forward search of the edit
 116      matrix.  */
 117   OFFSET *fdiag;
 118 
 119   /* Vector, indexed by diagonal, containing the X coordinate of the point
 120      furthest along the given diagonal in the backward search of the edit
 121      matrix.  */
 122   OFFSET *bdiag;
 123 
 124   #ifdef USE_HEURISTIC
 125   /* This corresponds to the diff --speed-large-files flag.  With this
 126      heuristic, for vectors with a constant small density of changes,
 127      the algorithm is linear in the vector size.  */
 128   bool heuristic;
 129   #endif
 130 
 131   /* Edit scripts longer than this are too expensive to compute.  */
 132   OFFSET too_expensive;
 133 
 134   /* Snakes bigger than this are considered "big".  */
 135   #define SNAKE_LIMIT 20
 136 };
 137 
 138 struct partition
 139 {
 140   /* Midpoints of this partition.  */
 141   OFFSET xmid;
 142   OFFSET ymid;
 143 
 144   /* True if low half will be analyzed minimally.  */
 145   bool lo_minimal;
 146 
 147   /* Likewise for high half.  */
 148   bool hi_minimal;
 149 };
 150 
 151 
 152 /* Find the midpoint of the shortest edit script for a specified portion
 153    of the two vectors.
 154 
 155    Scan from the beginnings of the vectors, and simultaneously from the ends,
 156    doing a breadth-first search through the space of edit-sequence.
 157    When the two searches meet, we have found the midpoint of the shortest
 158    edit sequence.
 159 
 160    If FIND_MINIMAL is true, find the minimal edit script regardless of
 161    expense.  Otherwise, if the search is too expensive, use heuristics to
 162    stop the search and report a suboptimal answer.
 163 
 164    Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
 165    XMID - YMID equals the number of inserted elements minus the number
 166    of deleted elements (counting only elements before the midpoint).
 167 
 168    Set PART->lo_minimal to true iff the minimal edit script for the
 169    left half of the partition is known; similarly for PART->hi_minimal.
 170 
 171    This function assumes that the first elements of the specified portions
 172    of the two vectors do not match, and likewise that the last elements do not
 173    match.  The caller must trim matching elements from the beginning and end
 174    of the portions it is going to specify.
 175 
 176    If we return the "wrong" partitions, the worst this can do is cause
 177    suboptimal diff output.  It cannot cause incorrect diff output.  */
 178 
 179 static void
 180 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
     /* [previous][next][first][last][top][bottom][index][help] */
 181       struct partition *part, struct context *ctxt)
 182 {
 183   OFFSET *const fd = ctxt->fdiag;       /* Give the compiler a chance. */
 184   OFFSET *const bd = ctxt->bdiag;       /* Additional help for the compiler. */
 185 #ifdef ELEMENT
 186   ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
 187   ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
 188   #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
 189 #else
 190   #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
 191 #endif
 192   const OFFSET dmin = xoff - ylim;      /* Minimum valid diagonal. */
 193   const OFFSET dmax = xlim - yoff;      /* Maximum valid diagonal. */
 194   const OFFSET fmid = xoff - yoff;      /* Center diagonal of top-down search. */
 195   const OFFSET bmid = xlim - ylim;      /* Center diagonal of bottom-up search. */
 196   OFFSET fmin = fmid;
 197   OFFSET fmax = fmid;           /* Limits of top-down search. */
 198   OFFSET bmin = bmid;
 199   OFFSET bmax = bmid;           /* Limits of bottom-up search. */
 200   OFFSET c;                     /* Cost. */
 201   bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
 202                                    diagonal with respect to the northwest. */
 203 
 204   fd[fmid] = xoff;
 205   bd[bmid] = xlim;
 206 
 207   for (c = 1;; ++c)
 208     {
 209       OFFSET d;                 /* Active diagonal. */
 210       bool big_snake = false;
 211 
 212       /* Extend the top-down search by an edit step in each diagonal. */
 213       if (fmin > dmin)
 214         fd[--fmin - 1] = -1;
 215       else
 216         ++fmin;
 217       if (fmax < dmax)
 218         fd[++fmax + 1] = -1;
 219       else
 220         --fmax;
 221       for (d = fmax; d >= fmin; d -= 2)
 222         {
 223           OFFSET x;
 224           OFFSET y;
 225           OFFSET tlo = fd[d - 1];
 226           OFFSET thi = fd[d + 1];
 227           OFFSET x0 = tlo < thi ? thi : tlo + 1;
 228 
 229           for (x = x0, y = x0 - d;
 230                x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
 231                x++, y++)
 232             continue;
 233           if (x - x0 > SNAKE_LIMIT)
 234             big_snake = true;
 235           fd[d] = x;
 236           if (odd && bmin <= d && d <= bmax && bd[d] <= x)
 237             {
 238               part->xmid = x;
 239               part->ymid = y;
 240               part->lo_minimal = part->hi_minimal = true;
 241               return;
 242             }
 243         }
 244 
 245       /* Similarly extend the bottom-up search.  */
 246       if (bmin > dmin)
 247         bd[--bmin - 1] = OFFSET_MAX;
 248       else
 249         ++bmin;
 250       if (bmax < dmax)
 251         bd[++bmax + 1] = OFFSET_MAX;
 252       else
 253         --bmax;
 254       for (d = bmax; d >= bmin; d -= 2)
 255         {
 256           OFFSET x;
 257           OFFSET y;
 258           OFFSET tlo = bd[d - 1];
 259           OFFSET thi = bd[d + 1];
 260           OFFSET x0 = tlo < thi ? tlo : thi - 1;
 261 
 262           for (x = x0, y = x0 - d;
 263                xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
 264                x--, y--)
 265             continue;
 266           if (x0 - x > SNAKE_LIMIT)
 267             big_snake = true;
 268           bd[d] = x;
 269           if (!odd && fmin <= d && d <= fmax && x <= fd[d])
 270             {
 271               part->xmid = x;
 272               part->ymid = y;
 273               part->lo_minimal = part->hi_minimal = true;
 274               return;
 275             }
 276         }
 277 
 278       if (find_minimal)
 279         continue;
 280 
 281 #ifdef USE_HEURISTIC
 282       bool heuristic = ctxt->heuristic;
 283 #else
 284       bool heuristic = false;
 285 #endif
 286 
 287       /* Heuristic: check occasionally for a diagonal that has made lots
 288          of progress compared with the edit distance.  If we have any
 289          such, find the one that has made the most progress and return it
 290          as if it had succeeded.
 291 
 292          With this heuristic, for vectors with a constant small density
 293          of changes, the algorithm is linear in the vector size.  */
 294 
 295       if (200 < c && big_snake && heuristic)
 296         {
 297           {
 298             OFFSET best = 0;
 299 
 300             for (d = fmax; d >= fmin; d -= 2)
 301               {
 302                 OFFSET dd = d - fmid;
 303                 OFFSET x = fd[d];
 304                 OFFSET y = x - d;
 305                 OFFSET v = (x - xoff) * 2 - dd;
 306 
 307                 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
 308                   {
 309                     if (v > best
 310                         && xoff + SNAKE_LIMIT <= x && x < xlim
 311                         && yoff + SNAKE_LIMIT <= y && y < ylim)
 312                       {
 313                         /* We have a good enough best diagonal; now insist
 314                            that it end with a significant snake.  */
 315                         int k;
 316 
 317                         for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
 318                           if (k == SNAKE_LIMIT)
 319                             {
 320                               best = v;
 321                               part->xmid = x;
 322                               part->ymid = y;
 323                               break;
 324                             }
 325                       }
 326                   }
 327               }
 328             if (best > 0)
 329               {
 330                 part->lo_minimal = true;
 331                 part->hi_minimal = false;
 332                 return;
 333               }
 334           }
 335 
 336           {
 337             OFFSET best = 0;
 338 
 339             for (d = bmax; d >= bmin; d -= 2)
 340               {
 341                 OFFSET dd = d - bmid;
 342                 OFFSET x = bd[d];
 343                 OFFSET y = x - d;
 344                 OFFSET v = (xlim - x) * 2 + dd;
 345 
 346                 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
 347                   {
 348                     if (v > best
 349                         && xoff < x && x <= xlim - SNAKE_LIMIT
 350                         && yoff < y && y <= ylim - SNAKE_LIMIT)
 351                       {
 352                         /* We have a good enough best diagonal; now insist
 353                            that it end with a significant snake.  */
 354                         int k;
 355 
 356                         for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
 357                           if (k == SNAKE_LIMIT - 1)
 358                             {
 359                               best = v;
 360                               part->xmid = x;
 361                               part->ymid = y;
 362                               break;
 363                             }
 364                       }
 365                   }
 366               }
 367             if (best > 0)
 368               {
 369                 part->lo_minimal = false;
 370                 part->hi_minimal = true;
 371                 return;
 372               }
 373           }
 374         }
 375 
 376       /* Heuristic: if we've gone well beyond the call of duty, give up
 377          and report halfway between our best results so far.  */
 378       if (c >= ctxt->too_expensive)
 379         {
 380           OFFSET fxybest;
 381           OFFSET fxbest IF_LINT (= 0);
 382           OFFSET bxybest;
 383           OFFSET bxbest IF_LINT (= 0);
 384 
 385           /* Find forward diagonal that maximizes X + Y.  */
 386           fxybest = -1;
 387           for (d = fmax; d >= fmin; d -= 2)
 388             {
 389               OFFSET x = MIN (fd[d], xlim);
 390               OFFSET y = x - d;
 391               if (ylim < y)
 392                 {
 393                   x = ylim + d;
 394                   y = ylim;
 395                 }
 396               if (fxybest < x + y)
 397                 {
 398                   fxybest = x + y;
 399                   fxbest = x;
 400                 }
 401             }
 402 
 403           /* Find backward diagonal that minimizes X + Y.  */
 404           bxybest = OFFSET_MAX;
 405           for (d = bmax; d >= bmin; d -= 2)
 406             {
 407               OFFSET x = MAX (xoff, bd[d]);
 408               OFFSET y = x - d;
 409               if (y < yoff)
 410                 {
 411                   x = yoff + d;
 412                   y = yoff;
 413                 }
 414               if (x + y < bxybest)
 415                 {
 416                   bxybest = x + y;
 417                   bxbest = x;
 418                 }
 419             }
 420 
 421           /* Use the better of the two diagonals.  */
 422           if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
 423             {
 424               part->xmid = fxbest;
 425               part->ymid = fxybest - fxbest;
 426               part->lo_minimal = true;
 427               part->hi_minimal = false;
 428             }
 429           else
 430             {
 431               part->xmid = bxbest;
 432               part->ymid = bxybest - bxbest;
 433               part->lo_minimal = false;
 434               part->hi_minimal = true;
 435             }
 436           return;
 437         }
 438     }
 439   #undef XREF_YREF_EQUAL
 440 }
 441 
 442 
 443 /* Compare in detail contiguous subsequences of the two vectors
 444    which are known, as a whole, to match each other.
 445 
 446    The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
 447 
 448    Note that XLIM, YLIM are exclusive bounds.  All indices into the vectors
 449    are origin-0.
 450 
 451    If FIND_MINIMAL, find a minimal difference no matter how
 452    expensive it is.
 453 
 454    The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
 455 
 456    Return false if terminated normally, or true if terminated through early
 457    abort.  */
 458 
 459 static bool
 460 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
     /* [previous][next][first][last][top][bottom][index][help] */
 461             bool find_minimal, struct context *ctxt)
 462 {
 463 #ifdef ELEMENT
 464   ELEMENT const *xv = ctxt->xvec; /* Help the compiler.  */
 465   ELEMENT const *yv = ctxt->yvec;
 466   #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
 467 #else
 468   #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
 469 #endif
 470 
 471   while (true)
 472     {
 473       /* Slide down the bottom initial diagonal.  */
 474       while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
 475         {
 476           xoff++;
 477           yoff++;
 478         }
 479 
 480       /* Slide up the top initial diagonal. */
 481       while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
 482         {
 483           xlim--;
 484           ylim--;
 485         }
 486 
 487       /* Handle simple cases. */
 488       if (xoff == xlim)
 489         {
 490           while (yoff < ylim)
 491             {
 492               NOTE_INSERT (ctxt, yoff);
 493               if (EARLY_ABORT (ctxt))
 494                 return true;
 495               yoff++;
 496             }
 497           break;
 498         }
 499       if (yoff == ylim)
 500         {
 501           while (xoff < xlim)
 502             {
 503               NOTE_DELETE (ctxt, xoff);
 504               if (EARLY_ABORT (ctxt))
 505                 return true;
 506               xoff++;
 507             }
 508           break;
 509         }
 510 
 511       struct partition part;
 512 
 513       /* Find a point of correspondence in the middle of the vectors.  */
 514       diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
 515 
 516       /* Use the partitions to split this problem into subproblems.  */
 517       OFFSET xoff1, xlim1, yoff1, ylim1, xoff2, xlim2, yoff2, ylim2;
 518       bool find_minimal1, find_minimal2;
 519       if (!NOTE_ORDERED
 520           && ((xlim + ylim) - (part.xmid + part.ymid)
 521               < (part.xmid + part.ymid) - (xoff + yoff)))
 522         {
 523           /* The second problem is smaller and the caller doesn't
 524              care about order, so do the second problem first to
 525              lessen recursion.  */
 526           xoff1 = part.xmid; xlim1 = xlim;
 527           yoff1 = part.ymid; ylim1 = ylim;
 528           find_minimal1 = part.hi_minimal;
 529 
 530           xoff2 = xoff; xlim2 = part.xmid;
 531           yoff2 = yoff; ylim2 = part.ymid;
 532           find_minimal2 = part.lo_minimal;
 533         }
 534       else
 535         {
 536           xoff1 = xoff; xlim1 = part.xmid;
 537           yoff1 = yoff; ylim1 = part.ymid;
 538           find_minimal1 = part.lo_minimal;
 539 
 540           xoff2 = part.xmid; xlim2 = xlim;
 541           yoff2 = part.ymid; ylim2 = ylim;
 542           find_minimal2 = part.hi_minimal;
 543         }
 544 
 545       /* Recurse to do one subproblem.  */
 546       bool early = compareseq (xoff1, xlim1, yoff1, ylim1, find_minimal1, ctxt);
 547       if (early)
 548         return early;
 549 
 550       /* Iterate to do the other subproblem.  */
 551       xoff = xoff2; xlim = xlim2;
 552       yoff = yoff2; ylim = ylim2;
 553       find_minimal = find_minimal2;
 554     }
 555 
 556   return false;
 557   #undef XREF_YREF_EQUAL
 558 }
 559 
 560 #undef ELEMENT
 561 #undef EQUAL
 562 #undef OFFSET
 563 #undef EXTRA_CONTEXT_FIELDS
 564 #undef NOTE_DELETE
 565 #undef NOTE_INSERT
 566 #undef EARLY_ABORT
 567 #undef USE_HEURISTIC
 568 #undef XVECREF_YVECREF_EQUAL
 569 #undef OFFSET_MAX

/* [previous][next][first][last][top][bottom][index][help] */