Commit d1a87961 authored by Joey Adams's avatar Joey Adams Committed by Rusty Russell

bdelta: new module for binary diff/patch

parent aa6b7489
../../licenses/BSD-MIT
\ No newline at end of file
#include <string.h>
#include "config.h"
/**
* bdelta - Generate and apply binary deltas
*
* This library takes two strings containing binary data, and produces a
* "patch" that can be applied to the first one to produce the second one.
* It can be used to save bandwidth and disk space when many texts differing
* by a small number of bytes need to be transmitted or stored.
*
* Patches produced by this version of the library can be applied using future
* versions, but not past versions.
*
* bdelta implements the algorithm described in
* An O(ND) Difference Algorithm and Its Variations by Eugene W. Myers.
* Because its memory usage and expected running time are O(N + D^2),
* it works well only when the strings differ by a small number of bytes.
* This implementation stops trying when the strings differ by more than
* 1000 bytes, and falls back to producing a patch that simply emits the new
* string.
*
* Thus, bdelta does not save any space when given two strings that differ by
* more than 1000 bytes. This may be improved in a future version of the
* library.
*
* Example:
* #include <ccan/bdelta/bdelta.h>
* #include <stdio.h>
* #include <stdlib.h>
* #include <string.h>
*
* static void gulp(const char *filename, void **data_out, size_t *size_out);
*
* static int usage(const char *prog)
* {
* fprintf(
* stderr,
* "Usage: %s diff <old> <new> > <patch>\n"
* " %s patch <old> <patch> > <new>\n",
* prog, prog
* );
* return 1;
* }
*
* int main(int argc, char *argv[])
* {
* void *old, *new_, *patch;
* size_t old_size, new_size, patch_size;
* BDELTAcode rc;
*
* if (argc != 4)
* return usage(argv[0]);
*
* if (strcmp(argv[1], "diff") == 0) {
* gulp(argv[2], &old, &old_size);
* gulp(argv[3], &new_, &new_size);
*
* rc = bdelta_diff(old, old_size, new_, new_size, &patch, &patch_size);
* if (rc != BDELTA_OK) {
* bdelta_perror("bdelta_diff", rc);
* return 1;
* }
*
* if (fwrite(patch, 1, patch_size, stdout) != patch_size) {
* perror("stdout");
* return 1;
* }
* } else if (strcmp(argv[1], "patch") == 0) {
* gulp(argv[2], &old, &old_size);
* gulp(argv[3], &patch, &patch_size);
*
* rc = bdelta_patch(old, old_size, patch, patch_size, &new_, &new_size);
* if (rc != BDELTA_OK) {
* bdelta_perror("bdelta_patch", rc);
* return 1;
* }
*
* if (fwrite(new_, 1, new_size, stdout) != new_size) {
* perror("stdout");
* return 1;
* }
* } else {
* return usage(argv[0]);
* }
*
* free(old);
* free(new_);
* free(patch);
* return 0;
* }
*
* static void gulp(const char *filename, void **data_out, size_t *size_out)
* {
* FILE *f = fopen(filename, "rb");
* size_t size = 0;
* size_t alloc = 16;
* char *data = malloc(alloc);
*
* if (f == NULL || data == NULL)
* goto error;
*
* for (;;) {
* size += fread(data + size, 1, alloc - size, f);
* if (size < alloc) {
* if (!feof(f))
* goto error;
* break;
* }
* data = realloc(data, alloc *= 2);
* if (data == NULL)
* goto error;
* }
*
* if (fclose(f) != 0)
* goto error;
*
* *data_out = data;
* *size_out = size;
* return;
*
* error:
* perror(filename);
* exit(EXIT_FAILURE);
* }
*
* Author: Joey Adams <joeyadams3.14159@gmail.com>
* License: MIT
* Version: 0.1.1
*/
int main(int argc, char *argv[])
{
/* Expect exactly one argument */
if (argc != 2)
return 1;
if (strcmp(argv[1], "depends") == 0) {
/* Nothing */
return 0;
}
return 1;
}
/*
* Copyright (C) 2011 Joseph Adams <joeyadams3.14159@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "bdelta.h"
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct
{
unsigned char *cur; /* End of string; insertion point for new bytes */
unsigned char *end; /* End of buffer */
unsigned char *start; /* Beginning of string */
} SB;
/* sb is evaluated multiple times in these macros. */
#define sb_size(sb) ((size_t)((sb)->cur - (sb)->start))
#define sb_avail(sb) ((size_t)((sb)->end - (sb)->cur))
/* sb and need may be evaluated multiple times. */
#define sb_need(sb, need) do { \
if (sb_avail(sb) < (need)) \
if (sb_grow(sb, need) != 0) \
goto out_of_memory; \
} while (0)
static int sb_init(SB *sb)
{
sb->start = malloc(17);
if (sb->start == NULL)
return -1;
sb->cur = sb->start;
sb->end = sb->start + 16;
return 0;
}
static int sb_grow(SB *sb, size_t need)
{
size_t length = sb->cur - sb->start;
size_t alloc = sb->end - sb->start;
unsigned char *tmp;
do {
alloc *= 2;
} while (alloc < length + need);
tmp = realloc(sb->start, alloc + 1);
if (tmp == NULL)
return -1;
sb->start = tmp;
sb->cur = tmp + length;
sb->end = tmp + alloc;
return 0;
}
static int sb_putc(SB *sb, unsigned char c)
{
sb_need(sb, 1);
*sb->cur++ = c;
return 0;
out_of_memory:
return -1;
}
static int sb_write(SB *sb, const void *data, size_t size)
{
sb_need(sb, size);
memcpy(sb->cur, data, size);
sb->cur += size;
return 0;
out_of_memory:
return -1;
}
static void sb_return(SB *sb, void **data_out, size_t *length_out)
{
*sb->cur = 0;
if (data_out)
*data_out = sb->start;
else
free(sb->start);
if (length_out)
*length_out = sb->cur - sb->start;
}
static void sb_discard(SB *sb, void **data_out, size_t *length_out)
{
free(sb->start);
if (data_out)
*data_out = NULL;
if (length_out)
*length_out = 0;
}
/*
* The first byte in a patch is the "patch type", which indicates how the
* patch is formatted. This keeps the patch format flexible while retaining
* backward compatibility. Patches produced with an older version of
* the library can be applied with a newer version.
*
* PT_LITERAL
* Contains nothing more than the content of the new text.
*
* PT_CSI32
* A string of copy, skip, and insert instructions for generating the new
* string from the old.
*
* copy(size): Copy @size bytes from old to new.
* skip(size): Skip @size bytes of old.
* insert(text): Insert @text into new.
*
* The syntax is as follows:
*
* copy: instruction_byte(1) size
* skip: instruction_byte(2) size
* insert: instruction_byte(3) size $size*OCTET
*
* 0 <= size_param_length <= 4
* instruction_byte(op) = op | size_param_length << 2
* size: $size_param_length*OCTET
* -- size is an unsigned integer encoded in big endian.
* -- However, if size_param_length is 0, the operation size is 1.
*
* Simply put, an instruction starts with an opcode and operation size.
* An insert instruction is followed by the bytes to be inserted.
*/
#define PT_LITERAL 10
#define PT_CSI32 11
#define OP_COPY 1
#define OP_SKIP 2
#define OP_INSERT 3
static unsigned int bytes_needed_for_size(uint32_t size)
{
if (size == 1)
return 0;
else if (size <= 0xFF)
return 1;
else if (size <= 0xFFFF)
return 2;
else if (size <= 0xFFFFFF)
return 3;
else
return 4;
}
/*
* Return values:
*
* BDELTA_OK: Success
* BDELTA_MEMORY: Memory allocation failed
*/
static BDELTAcode csi32_emit_op(SB *patch_out, int op, uint32_t size, const char **new_)
{
unsigned int i;
unsigned int size_param_length;
size_t need;
uint32_t tmp;
assert(op >= 1 && op <= 3);
if (size == 0)
return BDELTA_OK;
size_param_length = bytes_needed_for_size(size);
need = 1 + size_param_length;
if (op == OP_INSERT)
need += size;
sb_need(patch_out, need);
*patch_out->cur++ = (unsigned int)op | size_param_length << 2;
for (i = size_param_length, tmp = size; i-- > 0; tmp >>= 8)
patch_out->cur[i] = tmp & 0xFF;
patch_out->cur += size_param_length;
switch (op) {
case OP_COPY:
*new_ += size;
break;
case OP_SKIP:
break;
case OP_INSERT:
memcpy(patch_out->cur, *new_, size);
patch_out->cur += size;
*new_ += size;
break;
default:
assert(0);
}
return BDELTA_OK;
out_of_memory:
return BDELTA_MEMORY;
}
/*
* On success, returns 1, advances *sp past the parsed text, and sets *op_out and *size_out.
* On error or EOF, returns 0.
*/
static int csi32_parse_op(
const unsigned char **sp, const unsigned char *e,
int *op_out, uint32_t *size_out)
{
const unsigned char *s = *sp;
int op;
unsigned int i;
unsigned int size_param_length;
uint32_t size;
if (s >= e)
return 0;
op = *s & 3;
size_param_length = *s >> 2;
s++;
if (op == 0 || size_param_length > 4)
return 0;
if (size_param_length == 0) {
size = 1;
} else {
if ((size_t)(e - s) < size_param_length)
return 0;
size = 0;
for (i = 0; i < size_param_length; i++) {
size <<= 8;
size |= *s++ & 0xFF;
}
}
/* Make sure insert data fits in the patch, but don't consume it. */
if (op == OP_INSERT && (size_t)(e - s) < size)
return 0;
*op_out = op;
*size_out = size;
*sp = s;
return 1;
}
/*
* bdelta uses the algorithm described in:
*
* Myers, E. (1986). An O(ND) Difference Algorithm and Its Variations.
* Retrieved from http://www.xmailserver.org/diff2.pdf
*
* The pseudocode in Myers' paper (Figure 2) uses an array called V,
* where (V[k], V[k] - k) is the endpoint of the furthest-reaching
* D-path ending on diagonal k.
*
* The structure below holds the V array for every iteration of the outer loop.
* Because each iteration produces D+1 values, a triangle is formed:
*
* k
* -5 -4 -3 -2 -1 0 1 2 3 4 5
* ----------------------------------
* 0 | 0 (copy 0)
* | \ skip 1
* 1 | 0 1
* | \ skip 1, then copy 1
* 2 | 2 2 3
* D | / insert 1, then copy 2
* 3 | 3 4 5 5
* | \ skip 1, then copy 1
* 4 | 3 4 5 7 7
* | / insert 1
* 5 | 3 4 5 7 - -
*
* @data will literally contain: 0 0 1 2 2 3 3 4 5 5 3 4 5 7 7 3 4 5 7
*
* To convert this to an edit script, we first climb back to the top,
* using the same procedure as was used when the triangle was generated:
*
* If k = -D, climb right (the only way we can go).
* If k = +D, climb left (the only way we can go).
* Otherwise, favor the greater number.
* If the numbers are the same, climb left.
*
* Finally, we convert the descent to the solution to a patch script:
*
* The top number n corresponds to:
* copy n
*
* A descent left from a to b corresponds to:
* insert 1
* copy b-a
*
* A descent right from a to b corresponds to:
* skip 1
* copy b-a-1
*/
typedef struct
{
uint32_t *data;
int solution_d;
int solution_k;
uint32_t *solution_ptr;
} Triangle;
/*
* Return values:
*
* BDELTA_OK: Success
* BDELTA_MEMORY: Memory allocation failed
* BDELTA_INTERNAL_DMAX_EXCEEDED: d_max exceeded (strings are too different)
*/
static BDELTAcode build_triangle(
const char *old, uint32_t old_size,
const char *new_, uint32_t new_size,
int d_max,
Triangle *triangle_out)
{
int d, k;
uint32_t x, y;
uint32_t *data;
uint32_t *vprev; /* position within previous row */
uint32_t *v; /* position within current row */
uint32_t *vcur; /* beginning of current row */
size_t data_alloc = 16;
memset(triangle_out, 0, sizeof(*triangle_out));
data = malloc(data_alloc * sizeof(*data));
if (data == NULL)
return BDELTA_MEMORY;
/* Allow dmax < 0 to mean "no limit". */
if (d_max < 0)
d_max = old_size + new_size;
/*
* Compute the farthest-reaching 0-path so the loop after this
* will have a "previous" row to start with.
*/
for (x = 0; x < old_size && x < new_size && old[x] == new_[x]; )
x++;
*data = x;
if (x >= old_size && x >= new_size) {
/* Strings are equal, so return a triangle with one row (a dot). */
assert(x == old_size && x == new_size);
triangle_out->data = data;
triangle_out->solution_d = 0;
triangle_out->solution_k = 0;
triangle_out->solution_ptr = data;
return BDELTA_OK;
}
vprev = data;
vcur = v = data + 1;
/*
* Here is the core of the Myers diff algorithm.
*
* This is a direct translation of the pseudocode in Myers' paper,
* with implementation-specific adaptations:
*
* * Every V array is preserved per iteration of the outer loop.
* This is necessary so we can determine the actual patch, not just
* the length of the shortest edit string. See the coment above
* the definition of Triangle for an in-depth explanation.
*
* * Array items are stored consecutively so as to not waste space.
*
* * The buffer holding the V arrays is expanded dynamically.
*/
for (d = 1; d <= d_max; d++, vprev = vcur, vcur = v) {
/* Ensure that the buffer has enough room for this row. */
if ((size_t)(v - data + d + 1) > data_alloc) {
size_t vprev_idx = vprev - data;
size_t v_idx = v - data;
size_t vcur_idx = vcur - data;
uint32_t *tmp;
do {
data_alloc *= 2;
} while ((size_t)(v - data + d + 1) > data_alloc);
tmp = realloc(data, data_alloc * sizeof(*data));
if (tmp == NULL) {
free(data);
return BDELTA_MEMORY;
}
data = tmp;
/* Relocate pointers to the buffer we just expanded. */
vprev = data + vprev_idx;
v = data + v_idx;
vcur = data + vcur_idx;
}
for (k = -d; k <= d; k += 2, vprev++) {
if (k == -d || (k != d && vprev[-1] < vprev[0]))
x = vprev[0];
else
x = vprev[-1] + 1;
y = x - k;
while (x < old_size && y < new_size && old[x] == new_[y])
x++, y++;
*v++ = x;
if (x >= old_size && y >= new_size) {
/* Shortest edit string found. */
assert(x == old_size && y == new_size);
triangle_out->data = data;
triangle_out->solution_d = d;
triangle_out->solution_k = k;
triangle_out->solution_ptr = v - 1;
return BDELTA_OK;
}
}
}
free(data);
return BDELTA_INTERNAL_DMAX_EXCEEDED;
}
/*
* Trace a solution back to the top, returning a string of instructions
* for descending from the top to the solution.
*
* An instruction is one of the following:
*
* -1: Descend left.
* +1: Descend right.
* 0: Finished. You should be at the solution now.
*
* If memory allocation fails, this function will return NULL.
*/
static signed char *climb_triangle(const Triangle *triangle)
{
signed char *descent;
int d, k;
uint32_t *p;
assert(triangle->solution_d >= 0);
descent = malloc(triangle->solution_d + 1);
if (descent == NULL)
return NULL;
d = triangle->solution_d;
k = triangle->solution_k;
p = triangle->solution_ptr;
descent[d] = 0;
while (d > 0) {
if (k == -d || (k != d && *(p-d-1) < *(p-d))) {
/* Climb right */
k++;
p = p - d;
descent[--d] = -1;
} else {
/* Climb left */
k--;
p = p - d - 1;
descent[--d] = 1;
}
}
return descent;
}
/*
* Generate the actual patch, given data produced by build_triangle and
* climb_triangle. new_ is needed for the content of the insertions.
*
* See the comment above the definition of Triangle. It concisely documents
* how a descent down the triangle corresponds to a patch script.
*
* The resulting patch, including the patch type byte, is appended to patch_out.
*
* Return values:
*
* BDELTA_OK: Success
* BDELTA_MEMORY: Memory allocation failed
*/
static BDELTAcode descent_to_patch(
const signed char *descent,
const Triangle *triangle,
const char *new_, uint32_t new_size,
SB *patch_out)
{
const char *new_end = new_ + new_size;
uint32_t *p = triangle->data;
uint32_t *p2;
int d = 0;
int k = 0;
int pending_op = 0;
int current_op;
uint32_t pending_length = 0;
uint32_t copy_length;
if (sb_putc(patch_out, PT_CSI32) != 0)
return BDELTA_MEMORY;
if (*p > 0) {
if (csi32_emit_op(patch_out, OP_COPY, *p, &new_) != BDELTA_OK)
return BDELTA_MEMORY;
}
for (; *descent != 0; descent++, p = p2) {
if (*descent < 0) {
/* Descend left. */
d++;
k--;
p2 = p + d;
current_op = OP_INSERT;
assert(*p2 >= *p);
copy_length = *p2 - *p;
} else {
/* Descend right. */
d++;
k++;
p2 = p + d + 1;
current_op = OP_SKIP;
assert(*p2 > *p);
copy_length = *p2 - *p - 1;
}
if (pending_op == current_op) {
pending_length++;
} else {
if (pending_op != 0) {
if (csi32_emit_op(patch_out, pending_op, pending_length, &new_) != BDELTA_OK)
return BDELTA_MEMORY;
}
pending_op = current_op;
pending_length = 1;
}
if (copy_length > 0) {
if (csi32_emit_op(patch_out, pending_op, pending_length, &new_) != BDELTA_OK)
return BDELTA_MEMORY;
pending_op = 0;
if (csi32_emit_op(patch_out, OP_COPY, copy_length, &new_) != BDELTA_OK)
return BDELTA_MEMORY;
}
}
assert(d == triangle->solution_d);
assert(k == triangle->solution_k);
assert(p == triangle->solution_ptr);
/* Emit the last pending op, unless it's a skip. */
if (pending_op != 0 && pending_op != OP_SKIP) {
if (csi32_emit_op(patch_out, pending_op, pending_length, &new_) != BDELTA_OK)
return BDELTA_MEMORY;
}
assert(new_ == new_end);
return BDELTA_OK;
}
/*
* Generate a patch using Myers' O(ND) algorithm.
*
* The patch is appended to @patch_out, which must be initialized before calling.
*
* Return values:
*
* BDELTA_OK: Success
* BDELTA_MEMORY: Memory allocation failed
* BDELTA_INTERNAL_INPUTS_TOO_LARGE: Input sizes are too large
* BDELTA_INTERNAL_DMAX_EXCEEDED: d_max exceeded (strings are too different)
*/
static BDELTAcode diff_myers(
const char *old, size_t old_size,
const char *new_, size_t new_size,
SB *patch_out)
{
Triangle triangle;
signed char *descent;
BDELTAcode rc;
/* Make sure old_size + new_size does not overflow int or uint32_t. */
if (old_size >= UINT32_MAX ||
new_size >= UINT32_MAX - old_size ||
old_size >= (unsigned int)INT_MAX ||
new_size >= (unsigned int)INT_MAX - old_size)
return BDELTA_INTERNAL_INPUTS_TOO_LARGE;
rc = build_triangle(old, old_size, new_, new_size, 1000, &triangle);
if (rc != BDELTA_OK)
return rc;
descent = climb_triangle(&triangle);
if (descent == NULL)
goto oom1;
if (descent_to_patch(descent, &triangle, new_, new_size, patch_out) != BDELTA_OK)
goto oom2;
free(descent);
free(triangle.data);
return BDELTA_OK;
oom2:
free(descent);
oom1:
free(triangle.data);
return BDELTA_MEMORY;
}
BDELTAcode bdelta_diff(
const void *old, size_t old_size,
const void *new_, size_t new_size,
void **patch_out, size_t *patch_size_out)
{
SB patch;
if (sb_init(&patch) != 0)
goto out_of_memory;
if (new_size == 0)
goto emit_new_literally;
if (diff_myers(old, old_size, new_, new_size, &patch) != BDELTA_OK)
goto emit_new_literally;
if (sb_size(&patch) > new_size) {
/*
* A literal copy of new is no longer than this patch.
* All that for nothing.
*/
goto emit_new_literally;
}
/*
* Verify that patch, when applied to old, produces the correct text.
* If it doesn't, it's a bug, but fall back to a simple emit
* to avert data corruption.
*/
{
void *result;
size_t result_size;
BDELTAcode rc;
int correct;
rc = bdelta_patch(
old, old_size,
patch.start, patch.cur - patch.start,
&result, &result_size
);
switch (rc) {
case BDELTA_OK:
correct = (result_size == new_size &&
memcmp(result, new_, new_size) == 0);
free(result);
break;
case BDELTA_MEMORY:
goto out_of_memory;
default:
correct = 0;
break;
}
if (!correct) {
assert(0);
goto emit_new_literally;
}
}
sb_return(&patch, patch_out, patch_size_out);
return BDELTA_OK;
emit_new_literally:
if (patch.cur != patch.start) {
free(patch.start);
if (sb_init(&patch) != 0)
goto out_of_memory;
}
if (sb_putc(&patch, PT_LITERAL) != 0 || sb_write(&patch, new_, new_size) != 0)
goto out_of_memory;
sb_return(&patch, patch_out, patch_size_out);
return BDELTA_OK;
out_of_memory:
sb_discard(&patch, patch_out, patch_size_out);
return BDELTA_MEMORY;
}
/*
* Return values:
*
* BDELTA_OK: Success
* BDELTA_PATCH_INVALID: Patch is malformed
* BDELTA_PATCH_MISMATCH: Old string is too small
* BDELTA_MEMORY: Memory allocation failed
*/
static BDELTAcode patch_csi32(
const unsigned char *o, const unsigned char *oe,
const unsigned char *p, const unsigned char *pe,
SB *new_out)
{
int op;
uint32_t size;
while (csi32_parse_op(&p, pe, &op, &size)) {
if ((op == OP_COPY || op == OP_SKIP) && (size_t)(oe - o) < size) {
/* Copy or skip instruction exceeds length of old string. */
return BDELTA_PATCH_MISMATCH;
}
if (op == OP_COPY || op == OP_INSERT)
sb_need(new_out, size);
switch (op) {
case OP_COPY: /* Copy @size bytes from old string. */
memcpy(new_out->cur, o, size);
new_out->cur += size;
o += size;
break;
case OP_SKIP: /* Skip @size bytes of old string. */
o += size;
break;
case OP_INSERT: /* Insert @size new bytes (from the patch script). */
memcpy(new_out->cur, p, size);
new_out->cur += size;
p += size;
break;
default:
assert(0);
}
}
if (p != pe)
return BDELTA_PATCH_INVALID;
return BDELTA_OK;
out_of_memory:
return BDELTA_MEMORY;
}
BDELTAcode bdelta_patch(
const void *old, size_t old_size,
const void *patch, size_t patch_size,
void **new_out, size_t *new_size_out)
{
const unsigned char *o = old;
const unsigned char *oe = o + old_size;
const unsigned char *p = patch;
const unsigned char *pe = p + patch_size;
SB result;
BDELTAcode rc;
if (sb_init(&result) != 0) {
rc = BDELTA_MEMORY;
goto discard;
}
if (p >= pe) {
rc = BDELTA_PATCH_INVALID;
goto discard;
}
switch (*p++) {
case PT_LITERAL:
if (sb_write(&result, p, pe - p) != 0) {
rc = BDELTA_MEMORY;
goto discard;
}
break;
case PT_CSI32:
rc = patch_csi32(o, oe, p, pe, &result);
if (rc != BDELTA_OK)
goto discard;
break;
default:
rc = BDELTA_PATCH_INVALID;
goto discard;
}
sb_return(&result, new_out, new_size_out);
return BDELTA_OK;
discard:
sb_discard(&result, new_out, new_size_out);
return rc;
}
const char *bdelta_strerror(BDELTAcode code)
{
switch (code) {
case BDELTA_OK:
return "Success";
case BDELTA_MEMORY:
return "Could not allocate memory";
case BDELTA_PATCH_INVALID:
return "Patch is invalid";
case BDELTA_PATCH_MISMATCH:
return "Patch applied to wrong data";
case BDELTA_INTERNAL_DMAX_EXCEEDED:
return "Difference threshold exceeded (internal error)";
case BDELTA_INTERNAL_INPUTS_TOO_LARGE:
return "Inputs are too large (internal error)";
default:
return "Invalid error code";
}
}
void bdelta_perror(const char *s, BDELTAcode code)
{
if (s != NULL && *s != '\0')
fprintf(stderr, "%s: %s\n", s, bdelta_strerror(code));
else
fprintf(stderr, "%s\n", bdelta_strerror(code));
}
/*
* Copyright (C) 2011 Joseph Adams <joeyadams3.14159@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef CCAN_BDELTA_H
#define CCAN_BDELTA_H
#include <stddef.h>
typedef enum {
BDELTA_OK = 0, /* Operation succeeded. */
BDELTA_MEMORY = 1, /* Memory allocation failed. */
BDELTA_PATCH_INVALID = 2, /* Patch is malformed. */
BDELTA_PATCH_MISMATCH = 3, /* Patch applied to wrong original string. */
/* Internal error codes. These will never be returned by API functions. */
BDELTA_INTERNAL_DMAX_EXCEEDED = -10,
BDELTA_INTERNAL_INPUTS_TOO_LARGE = -11,
} BDELTAcode;
/*
* bdelta_diff - Given two byte strings, generate a "patch" (also a byte string)
* that describes how to transform the old string into the new string.
*
* On success, returns BDELTA_OK, and passes a malloc'd block
* and its size through *patch_out and *patch_size_out.
*
* On failure, returns an error code, and clears *patch_out and *patch_size_out.
*
* Example:
* const char *old = "abcabba";
* const char *new_ = "cbabac";
* void *patch;
* size_t patch_size;
* BDELTAcode rc;
*
* rc = bdelta_diff(old, strlen(old), new_, strlen(new_), &patch, &patch_size);
* if (rc != BDELTA_OK) {
* bdelta_perror("bdelta_diff", rc);
* return;
* }
* ...
* free(patch);
*/
BDELTAcode bdelta_diff(
const void *old, size_t old_size,
const void *new_, size_t new_size,
void **patch_out, size_t *patch_size_out
);
/*
* bdelta_patch - Apply a patch produced by bdelta_diff to the
* old string to recover the new string.
*
* On success, returns BDELTA_OK, and passes a malloc'd block
* and its size through *new_out and *new_size_out.
*
* On failure, returns an error code, and clears *new_out and *new_size_out.
*
* Example:
* const char *old = "abcabba";
* void *new_;
* size_t new_size;
* BDELTAcode rc;
*
* rc = bdelta_patch(old, strlen(old), patch, patch_size, &new_, &new_size);
* if (rc != BDELTA_OK) {
* bdelta_perror("bdelta_patch", rc);
* return;
* }
* fwrite(new_, 1, new_size, stdout);
* putchar('\n');
* free(new_);
*/
BDELTAcode bdelta_patch(
const void *old, size_t old_size,
const void *patch, size_t patch_size,
void **new_out, size_t *new_size_out
);
/*
* bdelta_strerror - Return a string describing a bdelta error code.
*/
const char *bdelta_strerror(BDELTAcode code);
/*
* bdelta_perror - Print a bdelta error message to stderr.
*
* This function handles @s the same way perror does.
*/
void bdelta_perror(const char *s, BDELTAcode code);
#endif
#include <ccan/bdelta/bdelta.h>
#include <ccan/bdelta/bdelta.c>
#include <ccan/tap/tap.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
/*
* Finds a pseudorandom 32-bit number from 0 to 2^32-1 .
* Uses the BCPL linear congruential generator method.
*
* Used instead of system RNG to ensure tests are consistent.
*/
static uint32_t rand32(void)
{
#if 0
/*
* Tests should be run with a different random function
* from time to time. I've found that the method below
* sometimes behaves poorly for testing purposes.
* For example, rand32() % N might only return even numbers.
*/
assert(RAND_MAX == 2147483647);
return ((random() & 0xFFFF) << 16) | (random() & 0xFFFF);
#else
static uint32_t rand32_state = 0;
rand32_state *= (uint32_t)0x7FF8A3ED;
rand32_state += (uint32_t)0x2AA01D31;
return rand32_state;
#endif
}
#define RSTRING_OK 0 /* Operation succeeded */
#define RSTRING_MEMORY 1 /* Memory allocation failed */
#define RSTRING_ZERO_CARDINALITY 2 /* Cardinality of byte range is zero */
#define RSTRING_RANGE_OVERLAP 3 /* Byte range contains overlapping characters */
struct rstring_byte_range {
unsigned int cardinality;
unsigned int multiplier;
unsigned int offset;
};
static int check_range(const struct rstring_byte_range *range)
{
unsigned int i;
unsigned int c;
char set[256];
if (range == NULL)
return RSTRING_OK;
if (range->cardinality == 0)
return RSTRING_ZERO_CARDINALITY;
memset(set, 0, sizeof(set));
c = range->offset & 0xFF;
for (i = 0; i < range->cardinality; i++) {
if (set[c])
return RSTRING_RANGE_OVERLAP;
set[c] = 1;
c += range->multiplier;
c &= 0xFF;
}
return RSTRING_OK;
}
/*
* Generate a pseudorandom string of the given length,
* writing it into a user-supplied buffer.
*/
static uint8_t *random_string_into(uint8_t *str, uint32_t size, const struct rstring_byte_range *range)
{
uint32_t i;
uint32_t r;
if (range == NULL) {
for (i = 0; size - i >= 4; ) {
r = rand32();
str[i++] = r & 0xFF;
r >>= 8;
str[i++] = r & 0xFF;
r >>= 8;
str[i++] = r & 0xFF;
r >>= 8;
str[i++] = r & 0xFF;
}
if (i < size) {
r = rand32();
do {
str[i++] = r & 0xFF;
r >>= 8;
} while (i < size);
}
} else {
for (i = 0; i < size; )
str[i++] = ((rand32() % range->cardinality) * range->multiplier + range->offset) & 0xFF;
}
return str;
}
/*
* Generate a pseudorandom string of the given length.
*/
static uint8_t *random_string(uint32_t size, const struct rstring_byte_range *range)
{
uint8_t *str;
if (check_range(range) != RSTRING_OK) {
fprintf(stderr, "Invalid byte range for random string generation\n");
exit(EXIT_FAILURE);
}
str = malloc(size);
if (str == NULL)
return NULL;
return random_string_into(str, size, range);
}
/*
* Generate two byte strings:
*
* An "old" string, of length @old_size
* A "new" string, differing from "old" by at most @diff_size bytes
* (where modifying a character is considered one change).
*
* Returns RSTRING_OK on success, RSTRING_MEMORY if memory allocation fails.
*/
static int random_string_pair(
uint32_t old_size, uint32_t diff_size, const struct rstring_byte_range *range,
uint8_t **old_out, uint8_t **new_out, uint32_t *new_size_out)
{
uint8_t *old;
uint8_t *new_;
uint8_t *nptr;
uint32_t new_size;
uint32_t i;
uint32_t j;
/* insertions[i] is the number of characters to insert before position i. */
uint32_t *insertions;
/*
* changes[i] indicates what to do to the character at position i:
*
* 0: Leave it as-is.
* 1: Delete it.
* 2: Change it to another byte (possibly the same byte).
*/
char *changes;
/* String of new bytes to use for insertions and changes. */
uint8_t *new_bytes;
uint8_t *new_bytes_ptr;
uint32_t new_bytes_size;
old = random_string(old_size, range);
if (old == NULL)
goto oom0;
insertions = calloc(old_size + 1, sizeof(*insertions));
if (insertions == NULL)
goto oom1;
changes = calloc(old_size, sizeof(*changes));
if (changes == NULL)
goto oom2;
/*
* Generate a collection of edits, which will be
* applied to the old string "simultaneously".
*/
new_size = old_size;
new_bytes_size = 0;
for (i = 0; i < diff_size; i++) {
uint32_t pos;
switch (rand32() % 3) {
case 0: /* Insert a byte. */
pos = rand32() % (old_size + 1);
insertions[pos]++;
new_size++;
new_bytes_size++;
break;
case 1: /* Delete a byte. */
pos = rand32() % old_size;
if (changes[pos]) {
/*
* This character is already deleted or changed.
* Do something else instead.
*
* For the observative: i will overflow if it's 0.
* However, overflow of unsigned integers is well-defined
* by the C standard. i will wrap around to UINT32_MAX,
* then the for-loop increment above will wrap it back to 0.
*/
i--;
} else {
changes[pos] = 1;
new_size--;
}
break;
default: /* Change a byte. */
pos = rand32() % old_size;
if (changes[pos]) {
/*
* This character is already deleted or changed.
* Do something else instead.
*/
i--;
} else {
changes[pos] = 2;
new_bytes_size++;
}
break;
}
}
new_bytes = malloc(new_bytes_size);
if (new_bytes == NULL)
goto oom3;
random_string_into(new_bytes, new_bytes_size, range);
new_ = malloc(new_size);
if (new_ == NULL)
goto oom4;
/* Apply the insertions and changes generated above. */
nptr = new_;
new_bytes_ptr = new_bytes;
for (i = 0; i < old_size; i++) {
for (j = 0; j < insertions[i]; j++)
*nptr++ = *new_bytes_ptr++;
switch (changes[i]) {
case 0: /* No change */
*nptr++ = old[i];
break;
case 1: /* Delete */
break;
default: /* Change value */
*nptr++ = *new_bytes_ptr++;
break;
}
}
for (j = 0; j < insertions[i]; j++)
*nptr++ = *new_bytes_ptr++;
assert((size_t)(nptr - new_) == new_size);
assert((size_t)(new_bytes_ptr - new_bytes) == new_bytes_size);
free(new_bytes);
free(changes);
free(insertions);
if (old_out)
*old_out = old;
else
free(old);
if (new_out)
*new_out = new_;
else
free(new_);
if (new_size_out)
*new_size_out = new_size;
return RSTRING_OK;
oom4:
free(new_bytes);
oom3:
free(changes);
oom2:
free(insertions);
oom1:
free(old);
oom0:
if (old_out)
*old_out = NULL;
if (new_out)
*new_out = NULL;
if (new_size_out)
*new_size_out = 0;
return RSTRING_MEMORY;
}
#include "common.h"
/*
* Note that bdelta_diff verifies the patch before returning it (except for
* when it returns a PT_LITERAL patch, as its correctness is easy to prove).
* Only the trivial tests check the result explicitly using bdiff_patch.
*/
static int test_random(
uint32_t old_size, uint32_t diff_size,
unsigned int cardinality, unsigned int multiplier, unsigned int offset)
{
struct rstring_byte_range range;
uint8_t *old;
uint8_t *new_;
uint32_t new_size;
BDELTAcode rc;
range.cardinality = cardinality;
range.multiplier = multiplier;
range.offset = offset;
if (random_string_pair(old_size, diff_size, cardinality == 0 ? NULL : &range,
&old, &new_, &new_size) != RSTRING_OK)
{
fprintf(stderr, "Error generating random string pair\n");
exit(EXIT_FAILURE);
}
rc = bdelta_diff(old, old_size, new_, new_size, NULL, NULL);
if (rc != BDELTA_OK) {
bdelta_perror("bdelta_diff", rc);
return 0;
}
free(new_);
free(old);
return 1;
}
int main(void)
{
int i;
int count = 25;
plan_tests(count * 14);
for (i = 0; i < count; i++)
ok1(test_random(100, 10, 0, 0, 0));
for (i = 0; i < count; i++)
ok1(test_random(100, rand32() % 200, 0, 0, 0));
for (i = 0; i < count; i++)
ok1(test_random(1000, rand32() % 200, 0, 0, 0));
for (i = 0; i < count; i++)
ok1(test_random(1000, rand32() % 2000, 0, 0, 0));
for (i = 0; i < count; i++)
ok1(test_random(10000, rand32() % 200, 0, 0, 0));
for (i = 0; i < count; i++)
ok1(test_random(10000, rand32() % 2000, 0, 0, 0));
for (i = 0; i < count; i++)
ok1(test_random(rand32() % 20000, rand32() % 20000, 0, 0, 0));
/* Low-cardinality tests */
for (i = 0; i < count; i++)
ok1(test_random(100, 10, rand32() % 20 + 1, 1, i));
for (i = 0; i < count; i++)
ok1(test_random(100, rand32() % 200, rand32() % 20 + 1, 1, i));
for (i = 0; i < count; i++)
ok1(test_random(1000, rand32() % 200, rand32() % 20 + 1, 1, i));
for (i = 0; i < count; i++)
ok1(test_random(1000, rand32() % 2000, rand32() % 20 + 1, 1, i));
for (i = 0; i < count; i++)
ok1(test_random(10000, rand32() % 200, rand32() % 20 + 1, 1, i));
for (i = 0; i < count; i++)
ok1(test_random(10000, rand32() % 2000, rand32() % 20 + 1, 1, i));
for (i = 0; i < count; i++)
ok1(test_random(rand32() % 20000, rand32() % 20000, rand32() % 20 + 1, 1, i));
return exit_status();
}
#include "common.h"
#define is_digit(c) ((c) >= '0' && (c) <= '9')
static uint32_t parse_int(const char **sp)
{
const char *s = *sp;
uint32_t n = 0;
while (is_digit(*s)) {
n *= 10;
n += *s++ - '0';
}
*sp = s;
return n;
}
static const char *op_name(int op)
{
switch (op) {
case OP_COPY:
return "copy";
case OP_SKIP:
return "skip";
case OP_INSERT:
return "insert";
default:
return "<invalid opcode>";
}
}
static const char *verify_csi32(
const unsigned char *patch_start, const unsigned char *patch_end,
const char *expected)
{
const unsigned char *p = patch_start;
if (p >= patch_end)
return "Patch type byte missing";
if (*p++ != PT_CSI32)
return "Patch type byte is not PT_CSI32";
for (;;) {
int patch_op;
uint32_t patch_size;
int expected_op;
uint32_t expected_size;
while (*expected == ' ')
expected++;
if (*expected == '\0')
break;
if (!csi32_parse_op(&p, patch_end, &patch_op, &patch_size)) {
if (p == patch_end)
return "Patch shorter than expected.";
else
return "Patch contains invalid CSI-32";
}
switch (*expected) {
case 'c':
expected_op = OP_COPY;
break;
case 's':
expected_op = OP_SKIP;
break;
case 'i':
expected_op = OP_INSERT;
break;
default:
diag("verify_csi32: Unexpected character %c", *expected);
return "Syntax error in expected difference";
}
expected++;
while (*expected == ' ')
expected++;
if (patch_op != expected_op) {
diag("verify_csi32: Expected %s, but next op is %s %lu",
op_name(expected_op), op_name(patch_op), (unsigned long)patch_size);
return "Operation mismatch";
}
if (expected_op == OP_COPY || expected_op == OP_SKIP) {
if (!is_digit(*expected)) {
diag("verify_csi32: Expected size after %s instruction",
op_name(expected_op));
return "Syntax error in expected difference";
}
expected_size = parse_int(&expected);
if (patch_size != expected_size) {
diag("verify_csi32: Expected %s %lu, but patch says %s %lu",
op_name(expected_op), (unsigned long)expected_size,
op_name(expected_op), (unsigned long)patch_size);
return "Operation size mismatch";
}
} else {
if (*expected++ != '\'') {
diag("verify_csi32: Expected single-quoted string after insert instruction");
return "Syntax error in expected difference";
}
for (expected_size = 0; ; expected_size++) {
unsigned char c;
if (*expected == '\'' && *(expected + 1) == '\'') {
c = '\'';
expected += 2;
} else if (*expected == '\'') {
expected++;
break;
} else if (*expected == '\0') {
diag("verify_csi32: Missing end quote");
return "Syntax error in expected difference";
} else {
c = *expected++;
}
if (!(p < patch_end && *p++ == c))
return "Insertion string mismatch";
}
if (patch_size != expected_size)
return "Insertion string mismatch";
}
}
if (p != patch_end)
return "Patch longer than expected.";
return NULL;
}
static void test_myers(const char *old, const char *new_, const char *expected_difference)
{
SB patch;
BDELTAcode rc;
const char *verify_msg;
if (sb_init(&patch) != 0) {
fail("Out of memory");
return;
}
rc = diff_myers(old, strlen(old), new_, strlen(new_), &patch);
if (rc != BDELTA_OK) {
fail("test_myers(%s, %s, %s): diff_myers failed: %s", old, new_, expected_difference, bdelta_strerror(rc));
sb_discard(&patch, NULL, NULL);
return;
}
verify_msg = verify_csi32(patch.start, patch.cur, expected_difference);
sb_discard(&patch, NULL, NULL);
if (verify_msg != NULL) {
fail("test_myers(%s, %s, %s): %s", old, new_, expected_difference, verify_msg);
return;
}
pass("test_myers(%s, %s, %s)", old, new_, expected_difference);
}
int main(void)
{
(void) random_string_pair;
plan_tests(17);
test_myers("abcabba", "cbabac", "s2 c1 i'b' c2 s1 c1 i'c'");
test_myers("abcdefg", "abcdefg", "c7");
test_myers("abcdefg", "abcde", "c5");
test_myers("abcdefg", "abcdefga", "c7 i'a'");
test_myers("abbbbbb", "bbbbbb", "s1 c6");
test_myers("abbbbbb", "bbbbbbb", "s1 c6 i'b'");
test_myers("bbbbbb", "abbbbbb", "i'a' c6");
test_myers("bbbbbbb", "abbbbbb", "i'a' c6");
test_myers("bbbbbb", "abbbbbbb", "i'a' c6 i'b'");
test_myers("ac", "ba", "i'b' c1");
test_myers("ac", "bc", "s1 i'b' c1");
test_myers("abcd", "cabd", "i'c' c2 s1 c1");
test_myers("", "abc", "i'abc'");
test_myers("abc", "", "");
test_myers("abcd", "d", "s3 c1");
test_myers("", "", "");
/*
* In the event the strings have no characters in common, diff_myers will
* emit a skip of all the characters in the old string. Although it is
* unnecessary, it is tricky to get rid of. bdelta_diff will discard the
* patch anyway, because it's bigger than a literal.
*/
test_myers("aaaaaaa", "bbbbbbb", "s7 i'bbbbbbb'");
return exit_status();
}
#include <ccan/bdelta/bdelta.h>
#include <ccan/bdelta/bdelta.c>
#include <ccan/tap/tap.h>
static int test_trivial(const char *old, const char *new_)
{
void *patch;
size_t patch_size;
BDELTAcode rc;
void *new2;
size_t new2_size;
rc = bdelta_diff(old, strlen(old), new_, strlen(new_), &patch, &patch_size);
if (rc != BDELTA_OK) {
bdelta_perror("bdelta_diff", rc);
return 0;
}
if (patch_size > strlen(new_) + 1) {
fprintf(stderr, "bdelta_diff produced a patch larger than a simple literal emitting the new string.\n");
return 0;
}
rc = bdelta_patch(old, strlen(old), patch, patch_size, &new2, &new2_size);
if (rc != BDELTA_OK) {
bdelta_perror("bdelta_patch", rc);
return 0;
}
if (new2_size != strlen(new_) || strcmp(new2, new_) != 0) {
fprintf(stderr, "patch(old, diff(old, new)) != new\n");
return 0;
}
/* Make sure bdelta_diff properly discards unwanted return values. */
rc = bdelta_diff(old, strlen(old), new_, strlen(new_), NULL, NULL);
if (rc != BDELTA_OK) {
bdelta_perror("bdelta_diff (second time)", rc);
return 0;
}
free(new2);
free(patch);
return 1;
}
int main(void)
{
plan_tests(13);
ok1(test_trivial("abcabba", "cbabac"));
ok1(test_trivial("aaabbbcdaabcc", "aaabbcdaabeca"));
ok1(test_trivial("aaaaaaaa", "bbbbbbbb"));
ok1(test_trivial("aaaaaaaa", ""));
ok1(test_trivial("", "bbbbbbbb"));
ok1(test_trivial("", ""));
ok1(test_trivial("aaaaaaaa", "aaaaaaaabbbbbbbb"));
ok1(test_trivial("aaaaaaaa", "bbbbbbbbaaaaaaaa"));
ok1(test_trivial("aaaaaaaabbbbbbbb", "aaaaaaaa"));
ok1(test_trivial("aaaaaaaabbbbbbbb", "bbbbbbbb"));
ok1(test_trivial("aaaaaaaabbbbbbbb", "bbbbbbbb"));
ok1(test_trivial("abababababababab", "babababababababa"));
ok1(test_trivial("aababcabcdabcde", "aababcabcdabcde"));
return exit_status();
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment