xz 源码阅读 - 2

上篇文章说到coder->sequence被设置成了“SEQ_BLOCK”。

// Fall through

case SEQ_BLOCK: {
    const lzma_ret ret = coder->block_decoder.code(
            coder->block_decoder.coder, allocator,
            in, in_pos, in_size, out, out_pos, out_size,
            action);

    if (ret != LZMA_STREAM_END)
        return ret;

    // Block decoded successfully. Add the new size pair to
    // the Index hash.
    return_if_error(lzma_index_hash_append(coder->index_hash,
            lzma_block_unpadded_size(
                &coder->block_options),
            coder->block_options.uncompressed_size));

    coder->sequence = SEQ_BLOCK_HEADER;
    break;
}

随后,这里就开始调用block_decoder.code。前面它已经被设置成了block_decode:

next->coder = coder;
next->code = &block_decode;
next->end = &block_decoder_end;
coder->next = LZMA_NEXT_CODER_INIT;

// Basic initializations
coder->sequence = SEQ_CODE;
coder->block = block;
coder->compressed_size = 0;
coder->uncompressed_size = 0;

因此,我们查看block_decode的代码。由于代码比较长,这里继续加序号阅读。

static lzma_ret
block_decode(void *coder_ptr, const lzma_allocator *allocator,
        const uint8_t *restrict in, size_t *restrict in_pos,
        size_t in_size, uint8_t *restrict out,
        size_t *restrict out_pos, size_t out_size, lzma_action action)
{
    lzma_block_coder *coder = coder_ptr;

block_decode:1. 注意这里的coder->sequence是另一个“sequence”,它最开始是被初始化成SEQ_CODE的。因此第一步从调用coder->next.code开始。

coder = coder_ptr(参数1)是上一步的coder->block_decoder.coder,因此这里实际调用的是coder->block_decoder.coder->next.code,并把coder->block_decoder.coder->next.coder作为它的第一个参数。具体是哪个decoder由filter决定。

    switch (coder->sequence) {
    case SEQ_CODE: {
        const size_t in_start = *in_pos;
        const size_t out_start = *out_pos;

        const lzma_ret ret = coder->next.code(coder->next.coder,
                allocator, in, in_pos, in_size,
                out, out_pos, out_size, action);

        const size_t in_used = *in_pos - in_start;
        const size_t out_used = *out_pos - out_start;

        // NOTE: We compare to compressed_limit here, which prevents
        // the total size of the Block growing past LZMA_VLI_MAX.
        if (update_size(&coder->compressed_size, in_used,
                    coder->compressed_limit)
                || update_size(&coder->uncompressed_size,
                    out_used,
                    coder->block->uncompressed_size))
            return LZMA_DATA_ERROR;

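block_decode:2. 调用完成后,调用lzma_check_update,用本次新输出的数据更新校验值(校验类型由coder->block->check决定;这里只是累积计算,真正的比较在SEQ_CHECK中进行)。如果next.code返回的不是LZMA_STREAM_END,就直接返回;否则核对压缩大小和解压后大小,把最终值写回coder->block,然后设置sequence为SEQ_PADDING。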

        if (!coder->ignore_check)
            lzma_check_update(&coder->check, coder->block->check,
                    out + out_start, out_used);

        if (ret != LZMA_STREAM_END)
            return ret;

        // Compressed and Uncompressed Sizes are now at their final
        // values. Verify that they match the values given to us.
        if (!is_size_valid(coder->compressed_size,
                    coder->block->compressed_size)
                || !is_size_valid(coder->uncompressed_size,
                    coder->block->uncompressed_size))
            return LZMA_DATA_ERROR;

        // Copy the values into coder->block. The caller
        // may use this information to construct Index.
        coder->block->compressed_size = coder->compressed_size;
        coder->block->uncompressed_size = coder->uncompressed_size;

        coder->sequence = SEQ_PADDING;
    }

block_decode:3. 进入SEQ_PADDING状态,逐字节读取并跳过填充字节(每个字节必须是0x00,否则返回LZMA_DATA_ERROR),直到压缩数据长度对齐到4字节。

    // Fall through

    case SEQ_PADDING:
        // Compressed Data is padded to a multiple of four bytes.
        while (coder->compressed_size & 3) {
            if (*in_pos >= in_size)
                return LZMA_OK;

            // We use compressed_size here just get the Padding
            // right. The actual Compressed Size was stored to
            // coder->block already, and won't be modified by
            // us anymore.
            ++coder->compressed_size;

            if (in[(*in_pos)++] != 0x00)
                return LZMA_DATA_ERROR;
        }

        if (coder->block->check == LZMA_CHECK_NONE)
            return LZMA_STREAM_END;

        if (!coder->ignore_check)
            lzma_check_finish(&coder->check, coder->block->check);

        coder->sequence = SEQ_CHECK;

block_decode:4. 进入SEQ_CHECK状态,首先获取check_size,把输入中的校验字段拷贝到coder->block->raw_check,然后与前面计算出的coder->check.buffer进行比较。比较通过后返回LZMA_STREAM_END,结束。

    // Fall through

    case SEQ_CHECK: {
        const size_t check_size = lzma_check_size(coder->block->check);
        lzma_bufcpy(in, in_pos, in_size, coder->block->raw_check,
                &coder->check_pos, check_size);
        if (coder->check_pos < check_size)
            return LZMA_OK;

        // Validate the Check only if we support it.
        // coder->check.buffer may be uninitialized
        // when the Check ID is not supported.
        if (!coder->ignore_check
                && lzma_check_is_supported(coder->block->check)
                && memcmp(coder->block->raw_check,
                    coder->check.buffer.u8,
                    check_size) != 0)
            return LZMA_DATA_ERROR;

        return LZMA_STREAM_END;
    }
    }

    return LZMA_PROG_ERROR;
}

回到最开始的地方,coder->sequence被设置为SEQ_BLOCK_HEADER。这是SEQ_BLOCK的上一个状态,这里跳过,直接看下一个状态SEQ_INDEX。

case SEQ_INDEX: {
    // If we don't have any input, don't call
    // lzma_index_hash_decode() since it would return
    // LZMA_BUF_ERROR, which we must not do here.
    if (*in_pos >= in_size)
        return LZMA_OK;

    // Decode the Index and compare it to the hash calculated
    // from the sizes of the Blocks (if any).
    const lzma_ret ret = lzma_index_hash_decode(coder->index_hash,
            in, in_pos, in_size);
    if (ret != LZMA_STREAM_END)
        return ret;

    coder->sequence = SEQ_STREAM_FOOTER;
}

// Fall through

主代码为lzma_index_hash_decode。整体代码非常简单,这里不多做介绍了。

/*
 * Incrementally decode an Index field from in[*in_pos .. in_size) and
 * compare it against the per-Block data accumulated in index_hash->blocks.
 *
 * All decoding state (sequence, pos, remaining, crc32, records) lives in
 * *index_hash, so the function can be called repeatedly with consecutive
 * pieces of input; *in_pos is advanced past every byte that is consumed.
 *
 * Return values visible in this function:
 *   - LZMA_BUF_ERROR   if called with no input available
 *   - LZMA_OK          if the input ran out before the Index was finished
 *   - LZMA_STREAM_END  if the whole Index, including its CRC32, matched
 *   - LZMA_DATA_ERROR  on any mismatch or invalid value
 *   - whatever lzma_vli_decode() or hash_append() returned otherwise
 */
extern LZMA_API(lzma_ret)
lzma_index_hash_decode(lzma_index_hash *index_hash, const uint8_t *in,
        size_t *in_pos, size_t in_size)
{
    // Catch zero input buffer here, because in contrast to Index encoder
    // and decoder functions, applications call this function directly
    // instead of via lzma_code(), which does the buffer checking.
    if (*in_pos >= in_size)
        return LZMA_BUF_ERROR;

    // NOTE: This function has many similarities to index_encode() and
    // index_decode() functions found from index_encoder.c and
    // index_decoder.c. See the comments especially in index_encoder.c.

    // Remember where this call started so that the CRC32 can later be
    // updated over exactly the bytes consumed during this call.
    const size_t in_start = *in_pos;
    lzma_ret ret = LZMA_OK;

    // Each iteration handles one state; a state either consumes input and
    // breaks back to this loop, or falls through to the next state.
    while (*in_pos < in_size)
    switch (index_hash->sequence) {
    case SEQ_BLOCK:
        // Check the Index Indicator is present.
        if (in[(*in_pos)++] != 0x00)
            return LZMA_DATA_ERROR;

        index_hash->sequence = SEQ_COUNT;
        break;

    case SEQ_COUNT: {
        // Decode the Record count into index_hash->remaining.
        // index_hash->pos carries the decoder's position between calls.
        ret = lzma_vli_decode(&index_hash->remaining,
                &index_hash->pos, in, in_pos, in_size);
        if (ret != LZMA_STREAM_END)
            goto out;

        // The count must match the count of the Blocks decoded.
        if (index_hash->remaining != index_hash->blocks.count)
            return LZMA_DATA_ERROR;

        // Reset for the next integer to be decoded.
        ret = LZMA_OK;
        index_hash->pos = 0;

        // Handle the special case when there are no Blocks.
        index_hash->sequence = index_hash->remaining == 0
                ? SEQ_PADDING_INIT : SEQ_UNPADDED;
        break;
    }

    case SEQ_UNPADDED:
    case SEQ_UNCOMPRESSED: {
        // Both states decode one integer; only the destination differs.
        lzma_vli *size = index_hash->sequence == SEQ_UNPADDED
                ? &index_hash->unpadded_size
                : &index_hash->uncompressed_size;

        ret = lzma_vli_decode(size, &index_hash->pos,
                in, in_pos, in_size);
        if (ret != LZMA_STREAM_END)
            goto out;

        ret = LZMA_OK;
        index_hash->pos = 0;

        if (index_hash->sequence == SEQ_UNPADDED) {
            // Reject Unpadded Size values outside the allowed range
            // before going on to the Uncompressed Size of this Record.
            if (index_hash->unpadded_size < UNPADDED_SIZE_MIN
                    || index_hash->unpadded_size
                        > UNPADDED_SIZE_MAX)
                return LZMA_DATA_ERROR;

            index_hash->sequence = SEQ_UNCOMPRESSED;
        } else {
            // Update the hash.
            return_if_error(hash_append(&index_hash->records,
                    index_hash->unpadded_size,
                    index_hash->uncompressed_size));

            // Verify that we don't go over the known sizes. Note
            // that this validation is simpler than the one used
            // in lzma_index_hash_append(), because here we know
            // that values in index_hash->blocks are already
            // validated and we are fine as long as we don't
            // exceed them in index_hash->records.
            if (index_hash->blocks.blocks_size
                    < index_hash->records.blocks_size
                    || index_hash->blocks.uncompressed_size
                    < index_hash->records.uncompressed_size
                    || index_hash->blocks.index_list_size
                    < index_hash->records.index_list_size)
                return LZMA_DATA_ERROR;

            // Check if this was the last Record.
            index_hash->sequence = --index_hash->remaining == 0
                    ? SEQ_PADDING_INIT : SEQ_UNPADDED;
        }

        break;
    }

    case SEQ_PADDING_INIT:
        // Number of bytes (0-3) needed to round the value returned by
        // index_size_unpadded() up to a multiple of four. From here on
        // index_hash->pos counts the padding bytes still expected.
        index_hash->pos = (LZMA_VLI_C(4) - index_size_unpadded(
                index_hash->records.count,
                index_hash->records.index_list_size)) & 3;
        index_hash->sequence = SEQ_PADDING;

    // Fall through

    case SEQ_PADDING:
        // Consume one padding byte per loop iteration; each must be 0x00.
        if (index_hash->pos > 0) {
            --index_hash->pos;
            if (in[(*in_pos)++] != 0x00)
                return LZMA_DATA_ERROR;

            break;
        }

        // Compare the sizes.
        if (index_hash->blocks.blocks_size
                != index_hash->records.blocks_size
                || index_hash->blocks.uncompressed_size
                != index_hash->records.uncompressed_size
                || index_hash->blocks.index_list_size
                != index_hash->records.index_list_size)
            return LZMA_DATA_ERROR;

        // Finish the hashes and compare them.
        lzma_check_finish(&index_hash->blocks.check, LZMA_CHECK_BEST);
        lzma_check_finish(&index_hash->records.check, LZMA_CHECK_BEST);
        if (memcmp(index_hash->blocks.check.buffer.u8,
                index_hash->records.check.buffer.u8,
                lzma_check_size(LZMA_CHECK_BEST)) != 0)
            return LZMA_DATA_ERROR;

        // Finish the CRC32 calculation.
        index_hash->crc32 = lzma_crc32(in + in_start,
                *in_pos - in_start, index_hash->crc32);

        index_hash->sequence = SEQ_CRC32;

    // Fall through

    case SEQ_CRC32:
        // index_hash->pos is zero when this state is first entered (the
        // padding counter has run out) and now indexes the CRC32 byte
        // being compared, least significant byte first.
        do {
            // Out of input: return directly instead of going through
            // "out", because the CRC32 field itself must not be folded
            // into the CRC32.
            if (*in_pos == in_size)
                return LZMA_OK;

            if (((index_hash->crc32 >> (index_hash->pos * 8))
                    & 0xFF) != in[(*in_pos)++])
                return LZMA_DATA_ERROR;

        } while (++index_hash->pos < 4);

        return LZMA_STREAM_END;

    default:
        assert(0);
        return LZMA_PROG_ERROR;
    }

out:
    // Reached when the input ran out or lzma_vli_decode() did not return
    // LZMA_STREAM_END. Fold the bytes consumed during this call into the
    // running CRC32 so that the next call can continue from here.
    index_hash->crc32 = lzma_crc32(in + in_start,
            *in_pos - in_start, index_hash->crc32);

    return ret;
}

回到上一层,状态SEQ_STREAM_FOOTER,代码如下。

case SEQ_STREAM_FOOTER: {
    // Copy the Stream Footer to the internal buffer.
    lzma_bufcpy(in, in_pos, in_size, coder->buffer, &coder->pos,
            LZMA_STREAM_HEADER_SIZE);

    // Return if we didn't get the whole Stream Footer yet.
    if (coder->pos < LZMA_STREAM_HEADER_SIZE)
        return LZMA_OK;

    coder->pos = 0;

    // Decode the Stream Footer. The decoder gives
    // LZMA_FORMAT_ERROR if the magic bytes don't match,
    // so convert that return code to LZMA_DATA_ERROR.
    lzma_stream_flags footer_flags;

lzma_stream_footer_decode用于解码footer,解码flags并设置backward_size。校验footer size,并返回结果。

    const lzma_ret ret = lzma_stream_footer_decode(
            &footer_flags, coder->buffer);
    if (ret != LZMA_OK)
        return ret == LZMA_FORMAT_ERROR
                ? LZMA_DATA_ERROR : ret;

    // Check that Index Size stored in the Stream Footer matches
    // the real size of the Index field.
    if (lzma_index_hash_size(coder->index_hash)
            != footer_flags.backward_size)
        return LZMA_DATA_ERROR;

    // Compare that the Stream Flags fields are identical in
    // both Stream Header and Stream Footer.
    return_if_error(lzma_stream_flags_compare(
            &coder->stream_flags, &footer_flags));

    if (!coder->concatenated)
        return LZMA_STREAM_END;

    coder->sequence = SEQ_STREAM_PADDING;
}

// Fall through

然后是SEQ_STREAM_PADDING状态。这个状态也没有什么新奇的东西:跳过Stream之间值为0x00的padding字节,并检查padding的总长度是否为4的倍数;遇到非零字节时,如果padding长度合法,就重置decoder,准备解码下一个Stream。

    case SEQ_STREAM_PADDING:
        assert(coder->concatenated);

        // Skip over possible Stream Padding.
        while (true) {
            if (*in_pos >= in_size) {
                // Unless LZMA_FINISH was used, we cannot
                // know if there's more input coming later.
                if (action != LZMA_FINISH)
                    return LZMA_OK;

                // Stream Padding must be a multiple of
                // four bytes.
                return coder->pos == 0
                        ? LZMA_STREAM_END
                        : LZMA_DATA_ERROR;
            }

            // If the byte is not zero, it probably indicates
            // beginning of a new Stream (or the file is corrupt).
            if (in[*in_pos] != 0x00)
                break;

            ++*in_pos;
            coder->pos = (coder->pos + 1) & 3;
        }

        // Stream Padding must be a multiple of four bytes (empty
        // Stream Padding is OK).
        if (coder->pos != 0) {
            ++*in_pos;
            return LZMA_DATA_ERROR;
        }

        // Prepare to decode the next Stream.
        return_if_error(stream_decoder_reset(coder, allocator));
        break;

    default:
        assert(0);
        return LZMA_PROG_ERROR;
    }

    // Never reached
}

标签:none

添加新评论

captcha
请输入验证码