From 318b481ccc765e66b3ebe42e8c4cdf8ded2493c8 Mon Sep 17 00:00:00 2001 From: James Grogan Date: Tue, 22 Nov 2022 17:37:06 +0000 Subject: [PATCH] Add some bit utils and initial l77 encoder. --- src/compression/Lz77Encoder.cpp | 0 src/compression/Lz77Encoder.h | 168 ++++++++++++++++++++++ src/compression/ZlibData.h | 32 +++-- src/core/ByteUtils.cpp | 163 +++++++++++++++++++++ src/core/ByteUtils.h | 114 +++------------ src/core/CMakeLists.txt | 2 + src/core/streams/BitStream.cpp | 56 ++++++++ src/core/streams/BitStream.h | 27 ++++ test/CMakeLists.txt | 1 + test/compression/TestStreamCompressor.cpp | 22 ++- test/core/TestByteUtils.cpp | 18 +++ test/core/TestTomlReader.cpp | 22 +-- 12 files changed, 508 insertions(+), 117 deletions(-) create mode 100644 src/compression/Lz77Encoder.cpp create mode 100644 src/compression/Lz77Encoder.h create mode 100644 src/core/ByteUtils.cpp create mode 100644 src/core/streams/BitStream.cpp create mode 100644 src/core/streams/BitStream.h create mode 100644 test/core/TestByteUtils.cpp diff --git a/src/compression/Lz77Encoder.cpp b/src/compression/Lz77Encoder.cpp new file mode 100644 index 0000000..e69de29 diff --git a/src/compression/Lz77Encoder.h b/src/compression/Lz77Encoder.h new file mode 100644 index 0000000..4aa96ad --- /dev/null +++ b/src/compression/Lz77Encoder.h @@ -0,0 +1,168 @@ +#pragma once + +#include "StringUtils.h" + +#include +#include + +class Lz77Encoder +{ +public: + using DataStream = std::vector; + + unsigned lookAheadForMatchingChars(std::vector& matchBuffer, unsigned searchIndex, unsigned hitOffset, const std::string& stream, unsigned streamLoc) + { + auto remaining_size = stream.size() - streamLoc; + + unsigned num_hits{1}; + for (unsigned jdx=1; jdx< remaining_size; jdx++) + { + char buffer_char{0}; + if (searchIndex + jdx < mSearchBuffer.size()) + { + buffer_char = mSearchBuffer[searchIndex + jdx]; + } + else + { + buffer_char = stream[jdx - hitOffset]; + } + + auto lookahead_char = stream[streamLoc + jdx]; + if (lookahead_char == buffer_char) + { + matchBuffer.push_back(buffer_char); + num_hits++; + } + else + { + break; + } + } + return num_hits; + } + + void lookThroughSearchBuffer(char searchChar, unsigned& hitLength, unsigned& hitOffset, const std::string& stream, unsigned streamLoc) + { + for(unsigned idx=0; idx match_buffer{buffer_char}; + auto num_hits = lookAheadForMatchingChars(match_buffer, search_index, idx, stream, streamLoc); + + if (num_hits >= hitLength) + { + hitLength = num_hits; + hitOffset = idx + 1; + } + } + } + } + + std::string encode(const std::string& stream) + { + unsigned loc{0}; + std::string ret; + + while(loc < stream.size()) + { + auto search_char = stream[loc]; + + unsigned hit_length{0}; + unsigned hit_offset{0}; + lookThroughSearchBuffer(search_char, hit_length, hit_offset, stream, loc); + + if (hit_length > 0) + { + ret += "@" + std::to_string(hit_offset) + "L" + std::to_string(hit_length); + loc+=hit_length; + + auto hit_loc = mSearchBuffer.size() - hit_offset; + for(unsigned idx=hit_loc; idx #include @@ -10,12 +11,12 @@ class ZlibData public: void setByte(unsigned idx, unsigned char data) { - mData[idx] = data; + mBitStream.setByte(idx, data); } void setDataSize(std::size_t size) { - mData = std::vector(size); + mBitStream.setBufferSize(size); } void setCompressionMethod(unsigned char method) @@ -48,22 +49,22 @@ public: unsigned char ERROR = 0x03; bool in_final_block = false; - unsigned working_byte_id = 0; - for (unsigned idx=0; idx(working_byte) << std::endl; + auto working_byte = mBitStream.getCurrentByte(); + std::cout << "Into process data, byte is: " << static_cast(working_byte) << std::endl; - auto final_block = ByteUtils::getBitN(working_byte, 0); + unsigned char final_block{0}; + mBitStream.getNextNBits(1, final_block); if (final_block) { std::cout << "Got final block" << std::endl; in_final_block = true; } - auto compress_type = ByteUtils::getTwoBitsAtN(working_byte, 1); - std::cout << "Compress type byte is: " << static_cast(compress_type) << std::endl; - + unsigned char compress_type{0}; + mBitStream.getNextNBits(2, compress_type); + std::cout << "Compress type byte is: " << static_cast(compress_type) << std::endl; if (compress_type == NO_COMPRESSION) { std::cout << "Got NO_COMPRESSION" << std::endl; @@ -75,6 +76,12 @@ public: else if (compress_type == DYNAMIC_HUFFMAN) { std::cout << "Got DYNAMIC_HUFFMAN" << std::endl; + + unsigned char h_list{0}; + mBitStream.getNextNBits(5, h_list); + mHlist = h_list + 257; + std::cout << "Got HLIST " << mHlist << std::endl; + } else if (compress_type == ERROR) { @@ -85,7 +92,10 @@ public: } private: - std::vector mData; + BitStream mBitStream; + + unsigned mHlist{0}; + unsigned char mCmf{0}; unsigned char mFlg{0}; unsigned char mCompressionMethod{0}; diff --git a/src/core/ByteUtils.cpp b/src/core/ByteUtils.cpp new file mode 100644 index 0000000..25829c9 --- /dev/null +++ b/src/core/ByteUtils.cpp @@ -0,0 +1,163 @@ +#include "ByteUtils.h" + + +bool ByteUtils::MostSignificantBitIsOne(char c) +{ + return c & (1 << 7); +} + +ByteUtils::Word ByteUtils::GetWordFirstBit(const Word word) +{ + return word & ByteUtils::WORD_FIRST_BIT; +}; + +ByteUtils::Word ByteUtils::GetWordLastByte(const Word word) +{ + return word & ByteUtils::WORD_LAST_BYTE; +} + +unsigned char ByteUtils::getHigherNBits(unsigned char input, unsigned num) +{ + return input >> 8 - num; +} + +unsigned char ByteUtils::getLowerNBits(unsigned char input, unsigned num) +{ + switch (num) + { + case 1: + return input & 0x01; + case 2: + return input & 0x03; + case 3: + return input & 0x07; + case 4: + return input & 0x0F; + case 5: + return input & 0x1F; + case 6: + return input & 0x3F; + case 7: + return input & 0x7F; + case 8: + return input; + default: + return 0; + } +} + +unsigned char ByteUtils::getTwoBitsAtN(unsigned char input, unsigned n) +{ + return (input & (0x03 << n)) >> n; +} + +unsigned char ByteUtils::getMBitsAtN(unsigned char input, unsigned m, unsigned n) +{ + switch (m) + { + case 1: + return (input & (0x01 << n)) >> n; + case 2: + return (input & (0x03 << n)) >> n; + case 3: + return (input & (0x07 << n)) >> n; + case 4: + return (input & (0x0F << n)) >> n; + case 5: + return (input & (0x1F << n)) >> n; + case 6: + return (input & (0x3F << n)) >> n; + case 7: + return (input & (0x7F << n)) >> n; + case 8: + return input; + default: + return 0; + } +} + +unsigned char ByteUtils::getBitN(unsigned char input, unsigned n) +{ + return input & (1 << n); +} + +unsigned char ByteUtils::getFromString(const std::string& string) +{ + unsigned char ret{0}; + + if (string.length() < 8) + { + return ret; + } + + for(unsigned idx=0; idx<8; idx++) + { + if (string[idx] == '1') + { + ret |= (0x01 << (7 - idx)); + } + } + return ret; +} + +std::string ByteUtils::toString(unsigned char c) +{ + std::string ret; + for(unsigned idx=0; idx<8; idx++) + { + ret += getBitN(c, 7 - idx) ? '1' : '0'; + } + return ret; +} + +void ByteUtils::ReverseBuffer(char* buffer, char* reverse, unsigned size, unsigned targetSize) +{ + for(unsigned idx=0; idx(buffer, reverse); +} + +ByteUtils::DWord ByteUtils::ToDWord(char* buffer, bool reverse) +{ + return ToType(buffer, reverse); +} + +ByteUtils::QWord ByteUtils::ToQWord(char* buffer, bool reverse) +{ + return ToType(buffer, reverse); +} + +bool ByteUtils::Compare(char* buffer, const char* tag, unsigned size) +{ + for(unsigned idx=0; idx #include +#include class ByteUtils { @@ -10,75 +11,27 @@ public: using DWord = int32_t; using QWord = int64_t; - static bool MostSignificantBitIsOne(char c) - { - return c & (1 << 7); - } + static bool MostSignificantBitIsOne(char c); - static Word GetWordFirstBit(const Word word) - { - return word & ByteUtils::WORD_FIRST_BIT; - }; + static Word GetWordFirstBit(const Word word); - static Word GetWordLastByte(const Word word) - { - return word & ByteUtils::WORD_LAST_BYTE; - } + static Word GetWordLastByte(const Word word); - static unsigned char getHigherNBits(unsigned char input, unsigned num) - { - return input >> 8 - num; - } + static unsigned char getHigherNBits(unsigned char input, unsigned num); - static unsigned char getLowerNBits(unsigned char input, unsigned num) - { - switch (num) - { - case 1: - return input & 0x01; - case 2: - return input & 0x03; - case 3: - return input & 0x07; - case 4: - return input & 0x0F; - case 5: - return input & 0x1F; - case 6: - return input & 0x3F; - case 7: - return input & 0x7F; - case 8: - return input; - default: - return 0; - } - } + static unsigned char getLowerNBits(unsigned char input, unsigned num); - static unsigned char getTwoBitsAtN(unsigned char input, unsigned n) - { - return (input & (0x03 << n)) >> n; - } + static unsigned char getTwoBitsAtN(unsigned char input, unsigned n); - static unsigned char getBitN(unsigned char input, unsigned n) - { - return input & (1 << n); - } + static unsigned char getMBitsAtN(unsigned char input, unsigned m, unsigned n); - static void ReverseBuffer(char* buffer, char* reverse, unsigned size, unsigned targetSize) - { - for(unsigned idx=0; idx static T ToType(char* buffer, bool reverse = true) @@ -97,42 +50,17 @@ public: return result; } - static Word ToWord(char* buffer, bool reverse = true) - { - return ToType(buffer, reverse); - } + static Word ToWord(char* buffer, bool reverse = true); - static DWord ToDWord(char* buffer, bool reverse = true) - { - return ToType(buffer, reverse); - } + static DWord ToDWord(char* buffer, bool reverse = true); - static QWord ToQWord(char* buffer, bool reverse = true) - { - return ToType(buffer, reverse); - } + static QWord ToQWord(char* buffer, bool reverse = true); - static bool Compare(char* buffer, const char* tag, unsigned size) - { - for(unsigned idx=0; idx 0) + { + unsigned char last_byte = mCurrentByte; + if (!loadNextByte()) + { + return false; + } + + auto num_lower = 7 - mBitOffset; + char lower_bits = ByteUtils::getHigherNBits(last_byte, num_lower); + char higher_bits = ByteUtils::getLowerNBits(mCurrentByte, overshoot); + + buffer = (higher_bits << (8 - num_lower)) | (lower_bits >> mBitOffset); + + mBitOffset = overshoot; + return true; + } + else + { + buffer = ByteUtils::getMBitsAtN(mCurrentByte, n, mBitOffset); + mBitOffset += n; + return true; + } +} + +void BitStream::setByte(unsigned idx, unsigned char data) +{ + mBuffer[idx] = data; +} + +void BitStream::setBufferSize(std::size_t size) +{ + mBuffer = std::vector(size); +} diff --git a/src/core/streams/BitStream.h b/src/core/streams/BitStream.h new file mode 100644 index 0000000..c8193e5 --- /dev/null +++ b/src/core/streams/BitStream.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +class BitStream +{ +public: + bool getNextNBits(unsigned n, unsigned char& buffer); + + bool loadNextByte(); + + void setByte(unsigned idx, unsigned char data); + + void setBufferSize(std::size_t size); + + unsigned char getCurrentByte() const + { + return mCurrentByte; + } + +private: + unsigned mByteOffset{0}; + unsigned mBitOffset{0}; + + char mCurrentByte{0}; + std::vector mBuffer; +}; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 2ec7d73..919d76f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -10,6 +10,7 @@ target_include_directories(test_utils PUBLIC list(APPEND TestFiles audio/TestAudioWriter.cpp audio/TestMidiReader.cpp + core/TestByteUtils.cpp core/TestBinaryStream.cpp core/TestTomlReader.cpp compiler/TestLexer.cpp diff --git a/test/compression/TestStreamCompressor.cpp b/test/compression/TestStreamCompressor.cpp index cf62548..323a80e 100644 --- a/test/compression/TestStreamCompressor.cpp +++ b/test/compression/TestStreamCompressor.cpp @@ -2,6 +2,7 @@ #include "HuffmanEncoder.h" #include "RunLengthEncoder.h" +#include "Lz77Encoder.h" void test_run_length_encoder() { @@ -34,13 +35,30 @@ void test_huffman_encoder() HuffmanEncoder encoder; encoder.encode(counts); - } +void test_lz77_encoder() +{ + std::string test_data = "sir sid eastman easily teases sea sick seals"; + //std::string test_data = "sir sid eastman"; + + Lz77Encoder encoder; + auto encoded = encoder.encode(test_data); + + std::cout << "Encoded: " << encoded << std::endl; + + //auto decoded = encoder.decode(encoded); + + //std::cout << "Decoded: " << decoded << std::endl; +} + + int main() { - test_huffman_encoder(); + //test_huffman_encoder(); //test_run_length_encoder(); + + test_lz77_encoder(); return 0; } diff --git a/test/core/TestByteUtils.cpp b/test/core/TestByteUtils.cpp new file mode 100644 index 0000000..ae88e84 --- /dev/null +++ b/test/core/TestByteUtils.cpp @@ -0,0 +1,18 @@ +#include "ByteUtils.h" + +#include + +int main() +{ + auto byte = ByteUtils::getFromString("00110101"); + std::cout << "Value is " << static_cast(byte) << std::endl; + + auto string_rep = ByteUtils::toString(byte); + std::cout << "String rep is " << string_rep << std::endl; + + auto slice = ByteUtils::getMBitsAtN(byte, 3, 3); + std::cout << "Slice is " << ByteUtils::toString(slice) << std::endl; + + + return 0; +} diff --git a/test/core/TestTomlReader.cpp b/test/core/TestTomlReader.cpp index 4517ea6..9802cf6 100644 --- a/test/core/TestTomlReader.cpp +++ b/test/core/TestTomlReader.cpp @@ -5,17 +5,17 @@ int main() { - const auto data_loc = std::filesystem::path(__FILE__) / "../../data"; - const auto sample_toml_file = data_loc / "sample_toml.toml"; + const auto data_loc = std::filesystem::path(__FILE__) / "../../data"; + const auto sample_toml_file = data_loc / "sample_toml.toml"; - auto reader = TomlReader(); - reader.read(sample_toml_file); + auto reader = TomlReader(); + reader.read(sample_toml_file); - auto themes_table = reader.getContent()->getTable("themes"); - for (const auto& items : themes_table->getKeyValuePairs()) - { - std::cout << "Got entry with key: " << items.first << " and val " << items.second << std::endl; - } + auto themes_table = reader.getContent()->getTable("themes"); + for (const auto& items : themes_table->getKeyValuePairs()) + { + std::cout << "Got entry with key: " << items.first << " and val " << items.second << std::endl; + } - return 0; -} \ No newline at end of file + return 0; +}