stuff-from-scratch/src/compression/Lz77Encoder.cpp

331 lines
8.4 KiB
C++
Raw Normal View History

2022-11-23 15:41:33 +00:00
#include "Lz77Encoder.h"
#include "StringUtils.h"
#include "BitStream.h"
2022-11-28 10:16:04 +00:00
#include "ByteUtils.h"
#include "HuffmanEncoder.h"
#include <iostream>
2022-11-23 15:41:33 +00:00
// Builds an encoder over the given input/output bit streams.
// Both stream pointers are forwarded unchanged to AbstractEncoder; the search
// and lookahead buffers are constructed from their configured sizes.
Lz77Encoder::Lz77Encoder(BitStream* inputStream, BitStream* outputStream)
    : AbstractEncoder(inputStream, outputStream),
      mSearchBuffer(mSearchBufferSize),
      mLookaheadBuffer(mLookAheadBufferSize)
{
}
// Installs the prefix-code generator used by this encoder.
// Ownership of `generator` is transferred to the encoder; any generator that
// was previously held is replaced.
void Lz77Encoder::setPrefixCodeGenerator(std::unique_ptr<PrefixCodeGenerator> generator)
{
    mCodeGenerator = std::move(generator);
}
// Reports whether the hit buffer currently holds exactly mMaxHitBufferSize
// entries.
bool Lz77Encoder::hitBufferFull() const
{
    return mMaxHitBufferSize == mHitBuffer.size();
}
2022-11-23 15:41:33 +00:00
void Lz77Encoder::populateSearchBuffer(const Hit& hit)
2022-11-23 15:41:33 +00:00
{
const auto& [length, distance, next_char] = hit;
2022-11-23 15:41:33 +00:00
if (length == 0)
{
mSearchBuffer.addItem(next_char);
}
else
2022-11-23 15:41:33 +00:00
{
std::vector<unsigned char> new_items(distance, 0);
for(unsigned idx=0 ;idx<distance; idx++)
2022-11-23 15:41:33 +00:00
{
new_items[idx] = getSearchBufferItem(idx);
2022-11-23 15:41:33 +00:00
}
for(auto item : new_items)
{
mSearchBuffer.addItem(item);
}
int difference = int(length) - distance;
if (difference > 0)
2022-11-23 15:41:33 +00:00
{
for(unsigned idx=0; idx<difference; idx++)
{
mSearchBuffer.addItem(mLookaheadBuffer.getItem(idx));
}
2022-11-23 15:41:33 +00:00
}
}
}
// Reads the search buffer counting backwards from its last item:
// index 0 is the item at position getNumItems() - 1, index 1 the one before
// it, and so on. No bounds checking is performed here.
unsigned char Lz77Encoder::getSearchBufferItem(unsigned index) const
{
    const auto last_position = mSearchBuffer.getNumItems() - 1;
    return mSearchBuffer.getItem(last_position - index);
}
2022-11-23 15:41:33 +00:00
// Measures how many leading lookahead bytes match the data that starts
// `distance` bytes back from the front of the lookahead buffer.
//
// The comparison source starts in the search buffer and, once the match runs
// past its end (match longer than `distance`), continues into the lookahead
// buffer itself, which is what makes overlapping matches possible.
//
// Returns the number of matching bytes, or 0 when that number is below
// mMinLengthMatchSize. Only lookahead positions up to
// mMaxLookAheadBufferIndex are considered.
//
// Fix: the previous version returned idx + 1 when the byte at `idx`
// mismatched, counting the mismatching byte as part of the match.
unsigned Lz77Encoder::lookAheadForMatchingChars(unsigned distance)
{
    unsigned matched{0};
    for (unsigned idx = 0; idx < mMaxLookAheadBufferIndex + 1; idx++)
    {
        // Signed on purpose: a negative offset means the source position has
        // moved out of the search buffer and into the lookahead buffer.
        const int search_offset = static_cast<int>(distance) - 1 - static_cast<int>(idx);

        unsigned char search_char{0};
        if (search_offset < 0)
        {
            search_char = mLookaheadBuffer.getItem(-search_offset - 1);
        }
        else
        {
            search_char = getSearchBufferItem(static_cast<unsigned>(search_offset));
        }

        if (mLookaheadBuffer.getItem(idx) != search_char)
        {
            break;
        }
        matched = idx + 1;
    }
    return (matched >= mMinLengthMatchSize) ? matched : 0;
}
void Lz77Encoder::lookForMatches(unsigned char searchChar, unsigned& hitLength, unsigned& hitOffset)
2022-11-23 15:41:33 +00:00
{
for (unsigned idx = 0; idx< mSearchBuffer.getNumItems(); idx++)
2022-11-23 15:41:33 +00:00
{
if (mSearchBuffer.getItem(mSearchBuffer.getNumItems() - 1 - idx) == searchChar)
2022-11-23 15:41:33 +00:00
{
auto num_hits = lookAheadForMatchingChars(idx + 1);
if (num_hits > 0 && num_hits >= hitLength)
2022-11-23 15:41:33 +00:00
{
hitLength = num_hits;
hitOffset = idx + 1;
}
}
}
}
bool Lz77Encoder::lookAheadSourceEmpty() const
2022-11-28 10:16:04 +00:00
{
if (mLookaheadBuffer.getNumItems() < mLookAheadBufferSize)
{
return true;
}
if (mMaxLookAheadBufferIndex < mLookAheadBufferSize - 1)
{
return true;
}
return false;
}
void Lz77Encoder::populateLookaheadBuffer(unsigned size, bool firstPass)
{
if (!firstPass && lookAheadSourceEmpty())
{
for(unsigned idx=0; idx<size; idx++)
{
mLookaheadBuffer.addItem(0);
mMaxLookAheadBufferIndex--;
}
return;
}
bool stream_finished{false};
unsigned stream_end_id{0};
for(unsigned idx=0; idx<size; idx++)
{
if (!stream_finished)
{
auto byte = mInputStream->readNextByte();
if (!byte)
{
stream_finished = true;
stream_end_id = idx -1;
mLookaheadBuffer.addItem(0);
mMaxLookAheadBufferIndex--;
continue;
}
else
{
mLookaheadBuffer.addItem(*byte);
}
}
else
{
mLookaheadBuffer.addItem(0);
mMaxLookAheadBufferIndex--;
}
}
if (stream_finished && firstPass)
{
mMaxLookAheadBufferIndex = stream_end_id;
}
2022-11-28 10:16:04 +00:00
}
2022-11-23 15:41:33 +00:00
bool Lz77Encoder::encode()
{
2022-11-28 10:16:04 +00:00
if (!mCodeGenerator)
{
mCodeGenerator = std::make_unique<HuffmanEncoder>();
}
2022-11-28 10:16:04 +00:00
// Fill the lookahead buffer
mMaxLookAheadBufferIndex = mLookAheadBufferSize - 1;
populateLookaheadBuffer(mLookAheadBufferSize, true);
if(mMaxLookAheadBufferIndex < 0)
{
return true;
2022-11-28 10:16:04 +00:00
}
bool input_stream_ended{false};
while(!hitBufferFull())
2022-11-28 10:16:04 +00:00
{
if (mMaxLookAheadBufferIndex < 0)
{
input_stream_ended = true;
break;
}
2022-11-28 10:16:04 +00:00
const auto working_byte = mLookaheadBuffer.getItem(0);
unsigned hit_length{0};
unsigned hit_distance{0};
lookForMatches(working_byte, hit_length, hit_distance);
2022-11-28 10:16:04 +00:00
const Hit hit{hit_length, hit_distance, working_byte};
mHitBuffer.push_back(hit);
2022-11-28 10:16:04 +00:00
populateSearchBuffer(hit);
if (hit_length == 0)
{
populateLookaheadBuffer(1);
}
else
{
populateLookaheadBuffer(hit_length);
}
}
2022-11-28 10:16:04 +00:00
return input_stream_ended;
}
2022-11-28 10:16:04 +00:00
// Read-only access to the hits accumulated by encode(). The returned
// reference refers to the encoder's own buffer.
const std::vector<Lz77Encoder::Hit>& Lz77Encoder::getHitBuffer() const
{
    const auto& hits = mHitBuffer;
    return hits;
}
2022-11-23 15:41:33 +00:00
/*
void Lz77Encoder::flushHitBuffer()
{
// If dynamic huffman build trees
if (!mCodeGenerator)
2022-11-23 15:41:33 +00:00
{
mCodeGenerator = std::make_unique<HuffmanEncoder>();
}
2022-11-23 15:41:33 +00:00
// Convert hit buffer to prefix codes and write to output stream
for (const auto& hit : mHitBuffer)
{
const auto& [length, distance, next_char] = hit;
2022-11-23 15:41:33 +00:00
PrefixCode code;
if (length == 0)
2022-11-23 15:41:33 +00:00
{
code = *mCodeGenerator->getLiteralValue(next_char);
std::cout << "Writing symbol " << static_cast<int>(next_char) << " with code " << ByteUtils::toString(code.getData(), code.getLength()) << "\n";
2022-11-23 15:41:33 +00:00
mOutputStream->writeNBits(code.getData(), code.getLength());
2022-11-23 15:41:33 +00:00
}
else
{
code = *mCodeGenerator->getLengthValue(length);
const auto distance_code = mCodeGenerator->getDistanceValue(distance);
std::cout << "Writing length " << length << " with code " << ByteUtils::toString(code.getData(), code.getLength()) << "\n";
mOutputStream->writeNBits(code.getData(), code.getLength());
std::cout << "Writing distance " << distance << " with code " << ByteUtils::toString(distance_code.getData(), distance_code.getLength()) << "\n";
mOutputStream->writeNBits(distance_code.getData(), distance_code.getLength());
2022-11-23 15:41:33 +00:00
}
}
auto eos_code = mCodeGenerator->getEndOfStreamValue();
std::cout << "Writing EOS value with code " << ByteUtils::toString(eos_code->getData(), eos_code->getLength()) << "\n";
mOutputStream->writeNBits(eos_code->getData(), eos_code->getLength());
2022-11-23 15:41:33 +00:00
}
*/
2022-11-23 15:41:33 +00:00
// Decoding entry point. Currently a stub: it performs no work and always
// returns false.
//
// The commented-out body below is an earlier string-based prototype kept for
// reference only. It reads a text `stream` in which '@' introduces a
// back-reference written as "@<offset>L<length>", copies `length` characters
// starting `offset` characters back in the output, and copies every other
// character through unchanged. It returns a std::string and refers to a
// `stream` variable that does not exist in this function, so it cannot be
// re-enabled as-is.
bool Lz77Encoder::decode()
{
/*
std::string ret;
unsigned loc{0};
while(loc < stream.size())
{
auto working_char = stream[loc];
if (working_char == '@')
{
unsigned loc_working = loc;
auto remainder = stream.size() - loc;
std::string offset;
unsigned length_loc{0};
for(unsigned jdx=0; jdx< remainder; jdx++)
{
loc++;
auto offset_char = stream[loc];
if (offset_char == 'L')
{
loc++;
break;
}
else
{
offset += offset_char;
}
}
unsigned offset_amount = std::stoul(offset);
std::string length;
remainder = stream.size() - loc;
for(unsigned jdx=0; jdx< remainder; jdx++)
{
auto length_char = stream[loc];
if (StringUtils::IsAlphabetical(length_char) || length_char == '@')
{
break;
}
else
{
loc++;
length += length_char;
}
}
unsigned length_amount = std::stoul(length);
auto buffer_index = ret.size() - offset_amount;
for(unsigned jdx=buffer_index;jdx<buffer_index+length_amount; jdx++)
{
ret += ret[jdx];
}
}
else
{
loc++;
ret += working_char;
}
}
return ret;
*/
// Not implemented yet.
return false;
}