Add some bit utils and initial LZ77 encoder.

2022-11-22 17:37:06 +00:00 · 2022-11-22 17:37:06 +00:00 · 318b481ccc
commit 318b481ccc
parent ff962a6b16
12 changed files with 508 additions and 117 deletions
--- a/src/compression/Lz77Encoder.cpp
+++ b/src/compression/Lz77Encoder.cpp
--- a/src/compression/Lz77Encoder.h
+++ b/src/compression/Lz77Encoder.h
@ -0,0 +1,168 @@
+#pragma once
+
+#include "StringUtils.h"
+
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+// Text-based LZ77-style encoder/decoder.
+//
+// Encoded format: a character with no earlier occurrence is emitted verbatim;
+// every other run is emitted as "@<offset>L<length>", meaning "copy <length>
+// characters starting <offset> characters back in the decoded output".
+// Limitation: literals are not escaped, so input containing '@', or a digit
+// emitted as a literal directly after a back-reference, cannot be decoded
+// unambiguously.
+class Lz77Encoder
+{
+public:
+    using DataStream = std::vector<char>;
+
+    // Counts how many characters of stream, starting at streamLoc, match the
+    // search buffer starting at searchIndex. The first character is taken as
+    // already matched by the caller, so the count starts at 1.
+    //
+    // matchBuffer - receives every additionally matched character.
+    // searchIndex - index in mSearchBuffer of the first matched character;
+    //               must be smaller than mSearchBuffer.size().
+    // hitOffset   - distance of searchIndex from the end of the search buffer.
+    //               Kept for interface compatibility; the overlap position is
+    //               derived from searchIndex instead.
+    // stream      - the full input being encoded.
+    // streamLoc   - index in stream of the first matched character.
+    unsigned lookAheadForMatchingChars(std::vector<char>& matchBuffer, unsigned searchIndex, unsigned hitOffset, const std::string& stream, unsigned streamLoc)
+    {
+        (void)hitOffset;
+
+        const std::size_t remaining_size = streamLoc < stream.size() ? stream.size() - streamLoc : 0;
+
+        unsigned num_hits{1};
+        for (std::size_t jdx = 1; jdx < remaining_size; jdx++)
+        {
+            const std::size_t buffer_pos = static_cast<std::size_t>(searchIndex) + jdx;
+
+            char buffer_char{0};
+            if (buffer_pos < mSearchBuffer.size())
+            {
+                buffer_char = mSearchBuffer[buffer_pos];
+            }
+            else
+            {
+                // The match runs past the end of the search buffer and so
+                // overlaps the text being matched: the buffer conceptually
+                // continues with the characters of stream from streamLoc on.
+                buffer_char = stream[streamLoc + (buffer_pos - mSearchBuffer.size())];
+            }
+
+            if (stream[streamLoc + jdx] != buffer_char)
+            {
+                break;
+            }
+            matchBuffer.push_back(buffer_char);
+            num_hits++;
+        }
+        return num_hits;
+    }
+
+    // Scans the search buffer from its newest character backwards for
+    // occurrences of searchChar and records the longest match.
+    // hitLength and hitOffset are updated in place; candidates are compared
+    // with >=, so on equal length the match further back wins. hitOffset is
+    // 1-based and counted back from the end of the search buffer.
+    void lookThroughSearchBuffer(char searchChar, unsigned& hitLength, unsigned& hitOffset, const std::string& stream, unsigned streamLoc)
+    {
+        for (unsigned idx = 0; idx < mSearchBuffer.size(); idx++)
+        {
+            const auto search_index = static_cast<unsigned>(mSearchBuffer.size() - idx - 1);
+            const char buffer_char = mSearchBuffer[search_index];
+            if (buffer_char != searchChar)
+            {
+                continue;
+            }
+
+            std::vector<char> match_buffer{buffer_char};
+            const auto num_hits = lookAheadForMatchingChars(match_buffer, search_index, idx, stream, streamLoc);
+            if (num_hits >= hitLength)
+            {
+                hitLength = num_hits;
+                hitOffset = idx + 1;
+            }
+        }
+    }
+
+    // Encodes stream into the textual format described on the class.
+    // Every call starts from an empty search buffer so that the result only
+    // refers to data the decoder will have reconstructed itself.
+    std::string encode(const std::string& stream)
+    {
+        mSearchBuffer.clear();
+
+        unsigned loc{0};
+        std::string ret;
+
+        while (loc < stream.size())
+        {
+            const char search_char = stream[loc];
+
+            unsigned hit_length{0};
+            unsigned hit_offset{0};
+            lookThroughSearchBuffer(search_char, hit_length, hit_offset, stream, loc);
+
+            if (hit_length > 0)
+            {
+                ret += "@" + std::to_string(hit_offset) + "L" + std::to_string(hit_length);
+                loc += hit_length;
+
+                // Replay the match into the search buffer. Reading by index
+                // while appending lets an overlapping match copy characters
+                // that were appended earlier in this same loop.
+                const std::size_t hit_loc = mSearchBuffer.size() - hit_offset;
+                for (std::size_t idx = hit_loc; idx < hit_loc + hit_length; idx++)
+                {
+                    const char copied = mSearchBuffer[idx];
+                    mSearchBuffer.push_back(copied);
+                }
+            }
+            else
+            {
+                ret += search_char;
+                mSearchBuffer.push_back(search_char);
+                loc++;
+            }
+        }
+
+        return ret;
+    }
+
+    // Decodes text produced by encode().
+    // Throws std::invalid_argument (from std::stoul) when an offset or length
+    // field holds no number, and std::out_of_range when a back-reference
+    // points outside the data decoded so far.
+    std::string decode(const std::string& stream)
+    {
+        std::string ret;
+
+        std::size_t loc{0};
+        while (loc < stream.size())
+        {
+            const char working_char = stream[loc];
+            if (working_char != '@')
+            {
+                ret += working_char;
+                loc++;
+                continue;
+            }
+
+            // Skip the '@' and collect the offset field up to the 'L' marker.
+            loc++;
+            std::string offset;
+            while (loc < stream.size() && stream[loc] != 'L')
+            {
+                offset += stream[loc];
+                loc++;
+            }
+            if (loc < stream.size())
+            {
+                loc++; // step over 'L'
+            }
+            const auto offset_amount = std::stoul(offset);
+
+            // The length field is made of decimal digits only, so any other
+            // character (space, punctuation, ...) is kept as the next literal.
+            std::string length;
+            while (loc < stream.size() && stream[loc] >= '0' && stream[loc] <= '9')
+            {
+                length += stream[loc];
+                loc++;
+            }
+            const auto length_amount = std::stoul(length);
+
+            if (offset_amount == 0 || offset_amount > ret.size())
+            {
+                throw std::out_of_range("Lz77Encoder::decode: back-reference offset outside decoded data");
+            }
+
+            // Copy one character at a time: ret grows while copying, which is
+            // what makes overlapping references (length > offset) work.
+            const std::size_t buffer_index = ret.size() - offset_amount;
+            for (std::size_t jdx = 0; jdx < length_amount; jdx++)
+            {
+                const char copied = ret[buffer_index + jdx];
+                ret += copied;
+            }
+        }
+        return ret;
+    }
+
+    DataStream mSearchBuffer;
+    DataStream mLookaheadBuffer;
+};
--- a/src/compression/ZlibData.h
+++ b/src/compression/ZlibData.h
@ -1,6 +1,7 @@
 #pragma once

 #include "ByteUtils.h"
+#include "BitStream.h"

 #include <vector>
 #include <iostream>
@ -10,12 +11,12 @@ class ZlibData
 public:
    void setByte(unsigned idx, unsigned char data)
    {
-        mData[idx] = data;
+        mBitStream.setByte(idx, data);
    }

    void setDataSize(std::size_t size)
    {
-        mData = std::vector<unsigned char>(size);
+        mBitStream.setBufferSize(size);
    }

    void setCompressionMethod(unsigned char method)
@ -48,22 +49,22 @@ public:
        unsigned char ERROR = 0x03;

        bool in_final_block = false;
-        unsigned working_byte_id = 0;
-        for (unsigned idx=0; idx<mData.size(); idx++)
+        while(mBitStream.loadNextByte())
        {
-            auto working_byte = mData[working_byte_id];
-            std::cout << "Into process data, byte is: " << static_cast<int>(working_byte) << std::endl;
+            auto working_byte = mBitStream.getCurrentByte();
+            std::cout << "Into process data, byte is: " << static_cast<unsigned>(working_byte) << std::endl;

-            auto final_block = ByteUtils::getBitN(working_byte, 0);
+            unsigned char final_block{0};
+            mBitStream.getNextNBits(1, final_block);
            if (final_block)
            {
                std::cout << "Got final block" << std::endl;
                in_final_block = true;
            }

-            auto compress_type = ByteUtils::getTwoBitsAtN(working_byte, 1);
-            std::cout << "Compress type byte is: " << static_cast<int>(compress_type) << std::endl;
-
+            unsigned char compress_type{0};
+            mBitStream.getNextNBits(2, compress_type);
+            std::cout << "Compress type byte is: " << static_cast<unsigned>(compress_type) << std::endl;
            if (compress_type == NO_COMPRESSION)
            {
                std::cout << "Got NO_COMPRESSION" << std::endl;
@ -75,6 +76,12 @@ public:
            else if (compress_type == DYNAMIC_HUFFMAN)
            {
                std::cout << "Got DYNAMIC_HUFFMAN" << std::endl;
+
+                unsigned char h_list{0};
+                mBitStream.getNextNBits(5, h_list);
+                mHlist = h_list + 257;
+                std::cout << "Got HLIST " << mHlist << std::endl;
+
            }
            else if (compress_type == ERROR)
            {
@ -85,7 +92,10 @@ public:
    }

 private:
-    std::vector<unsigned char> mData;
+    BitStream mBitStream;
+
+    unsigned mHlist{0};
+
    unsigned char mCmf{0};
    unsigned char mFlg{0};
    unsigned char mCompressionMethod{0};
--- a/src/core/ByteUtils.cpp
+++ b/src/core/ByteUtils.cpp
@ -0,0 +1,163 @@
+#include "ByteUtils.h"
+
+
+// Reports whether bit 7 (mask 1 << 7) of c is set.
+bool ByteUtils::MostSignificantBitIsOne(char c)
+{
+    const int top_bit_mask = 1 << 7;
+    return (c & top_bit_mask) != 0;
+}
+
+// Returns word masked with WORD_FIRST_BIT.
+ByteUtils::Word ByteUtils::GetWordFirstBit(const Word word)
+{
+    const auto masked = word & ByteUtils::WORD_FIRST_BIT;
+    return masked;
+}
+
+// Returns word masked with WORD_LAST_BYTE.
+ByteUtils::Word ByteUtils::GetWordLastByte(const Word word)
+{
+    const auto masked = word & ByteUtils::WORD_LAST_BYTE;
+    return masked;
+}
+
+// Returns the num most significant bits of input, shifted down so that they
+// occupy the low end of the result. num == 0 yields 0; num >= 8 yields input
+// unchanged (previously num > 8 produced an out-of-range shift).
+unsigned char ByteUtils::getHigherNBits(unsigned char input, unsigned num)
+{
+    if (num >= 8)
+    {
+        return input;
+    }
+    return static_cast<unsigned char>(input >> (8 - num));
+}
+
+// Returns the num least significant bits of input.
+// num outside 1..8 yields 0; num == 8 yields input unchanged.
+unsigned char ByteUtils::getLowerNBits(unsigned char input, unsigned num)
+{
+    if (num == 0 || num > 8)
+    {
+        return 0;
+    }
+    const unsigned low_mask = (1u << num) - 1u;
+    return static_cast<unsigned char>(input & low_mask);
+}
+
+// Extracts the two bits of input at positions n and n + 1 (bit 0 is the
+// least significant) and returns them in the low bits of the result.
+unsigned char ByteUtils::getTwoBitsAtN(unsigned char input, unsigned n)
+{
+    const int pair_mask = 0x03 << n;
+    return (input & pair_mask) >> n;
+}
+
+// Extracts m bits of input starting at bit n (bit 0 is the least
+// significant) and returns them in the low bits of the result.
+// Bits requested beyond bit 7 read as zero. m outside 1..8 or n > 7 yields 0.
+// m == 8 now honours n like every other width (it used to return input
+// unchanged regardless of n).
+unsigned char ByteUtils::getMBitsAtN(unsigned char input, unsigned m, unsigned n)
+{
+    if (m == 0 || m > 8 || n > 7)
+    {
+        return 0;
+    }
+    const unsigned width_mask = (1u << m) - 1u;
+    return static_cast<unsigned char>((input >> n) & width_mask);
+}
+
+// Returns input masked with (1 << n). The result is the masked value itself
+// (zero, or the bit still in position n), not a normalised 0/1.
+unsigned char ByteUtils::getBitN(unsigned char input, unsigned n)
+{
+    const int bit_mask = 1 << n;
+    return input & bit_mask;
+}
+
+// Builds a byte from the first eight characters of string, most significant
+// bit first: a '1' sets the bit, any other character leaves it clear.
+// Strings shorter than eight characters yield 0.
+unsigned char ByteUtils::getFromString(const std::string& string)
+{
+    unsigned char packed{0};
+    if (string.length() >= 8)
+    {
+        for (unsigned pos = 0; pos < 8; ++pos)
+        {
+            const bool is_set = string[pos] == '1';
+            if (is_set)
+            {
+                packed |= (0x01 << (7 - pos));
+            }
+        }
+    }
+    return packed;
+}
+
+// Renders c as eight '0'/'1' characters, most significant bit first.
+std::string ByteUtils::toString(unsigned char c)
+{
+    std::string bits(8, '0');
+    for (unsigned pos = 0; pos < 8; ++pos)
+    {
+        if (getBitN(c, 7 - pos))
+        {
+            bits[pos] = '1';
+        }
+    }
+    return bits;
+}
+
+// Writes targetSize bytes to reverse: the first size bytes of buffer in
+// reversed order, followed by zero padding if targetSize exceeds size.
+void ByteUtils::ReverseBuffer(char* buffer, char* reverse, unsigned size, unsigned targetSize)
+{
+    for (unsigned out = 0; out < targetSize; ++out)
+    {
+        reverse[out] = (out < size) ? buffer[size - 1 - out] : 0;
+    }
+}
+
+// Converts buffer to a Word by forwarding to ToType<Word>.
+ByteUtils::Word ByteUtils::ToWord(char* buffer, bool reverse)
+{
+    const Word converted = ToType<Word>(buffer, reverse);
+    return converted;
+}
+
+// Converts buffer to a DWord by forwarding to ToType<DWord>.
+ByteUtils::DWord ByteUtils::ToDWord(char* buffer, bool reverse)
+{
+    const DWord converted = ToType<DWord>(buffer, reverse);
+    return converted;
+}
+
+// Converts buffer to a QWord by forwarding to ToType<QWord>.
+ByteUtils::QWord ByteUtils::ToQWord(char* buffer, bool reverse)
+{
+    const QWord converted = ToType<QWord>(buffer, reverse);
+    return converted;
+}
+
+// Returns true when the first size bytes of buffer and tag are equal
+// (trivially true for size == 0).
+bool ByteUtils::Compare(char* buffer, const char* tag, unsigned size)
+{
+    unsigned pos = 0;
+    while (pos < size && tag[pos] == buffer[pos])
+    {
+        ++pos;
+    }
+    return pos == size;
+}
+
+// Compares the first sizeof(DWord) bytes of buffer and tag.
+bool ByteUtils::CompareDWords(char* buffer, const char* tag)
+{
+    const unsigned byte_count = sizeof(DWord);
+    return Compare(buffer, tag, byte_count);
+}
+
+// Compares the first sizeof(Word) bytes of buffer and tag.
+bool ByteUtils::CompareWords(char* buffer, const char* tag)
+{
+    const unsigned byte_count = sizeof(Word);
+    return Compare(buffer, tag, byte_count);
+}
--- a/src/core/ByteUtils.h
+++ b/src/core/ByteUtils.h
@ -2,6 +2,7 @@

 #include <cstring>
 #include <stdint.h>
+#include <string>

 class ByteUtils
 {
@ -10,75 +11,27 @@ public:
    using DWord = int32_t;
    using QWord = int64_t;

-    static bool MostSignificantBitIsOne(char c)
-    {
-        return c & (1 << 7);
-    }
+    static bool MostSignificantBitIsOne(char c);

-    static Word GetWordFirstBit(const Word word)
-    {
-        return word & ByteUtils::WORD_FIRST_BIT;
-    };
+    static Word GetWordFirstBit(const Word word);

-    static Word GetWordLastByte(const Word word)
-    {
-        return word & ByteUtils::WORD_LAST_BYTE;
-    }
+    static Word GetWordLastByte(const Word word);

-    static unsigned char getHigherNBits(unsigned char input, unsigned num)
-    {
-        return input >> 8 - num;
-    }
+    static unsigned char getHigherNBits(unsigned char input, unsigned num);

-    static unsigned char getLowerNBits(unsigned char input, unsigned num)
-    {
-        switch (num)
-        {
-        case 1:
-            return input & 0x01;
-        case 2:
-            return input & 0x03;
-        case 3:
-            return input & 0x07;
-        case 4:
-            return input & 0x0F;
-        case 5:
-            return input & 0x1F;
-        case 6:
-            return input & 0x3F;
-        case 7:
-            return input & 0x7F;
-        case 8:
-            return input;
-        default:
-            return 0;
-        }
-    }
+    static unsigned char getLowerNBits(unsigned char input, unsigned num);

-    static unsigned char getTwoBitsAtN(unsigned char input, unsigned n)
-    {
-        return (input & (0x03 << n)) >> n;
-    }
+    static unsigned char getTwoBitsAtN(unsigned char input, unsigned n);

-    static unsigned char getBitN(unsigned char input, unsigned n)
-    {
-        return input & (1 << n);
-    }
+    static unsigned char getMBitsAtN(unsigned char input, unsigned m, unsigned n);

-    static void ReverseBuffer(char* buffer, char* reverse, unsigned size, unsigned targetSize)
-    {
-        for(unsigned idx=0; idx<targetSize; idx++)
-        {
-            if (idx < size)
-            {
-                reverse[idx] = buffer[size - 1 -idx];
-            }
-            else
-            {
-                reverse[idx] = 0;
-            }
-        }
-    }
+    static unsigned char getBitN(unsigned char input, unsigned n);
+
+    static unsigned char getFromString(const std::string& string);
+
+    static std::string toString(unsigned char c);
+
+    static void ReverseBuffer(char* buffer, char* reverse, unsigned size, unsigned targetSize);

    template<typename T>
    static T ToType(char* buffer, bool reverse = true)
@ -97,42 +50,17 @@ public:
        return result;
    }

-    static Word ToWord(char* buffer, bool reverse = true)
-    {
-        return ToType<Word>(buffer, reverse);
-    }
+    static Word ToWord(char* buffer, bool reverse = true);

-    static DWord ToDWord(char* buffer, bool reverse = true)
-    {
-        return ToType<DWord>(buffer, reverse);
-    }
+    static DWord ToDWord(char* buffer, bool reverse = true);

-    static QWord ToQWord(char* buffer, bool reverse = true)
-    {
-        return ToType<QWord>(buffer, reverse);
-    }
+    static QWord ToQWord(char* buffer, bool reverse = true);

-    static bool Compare(char* buffer, const char* tag, unsigned size)
-    {
-        for(unsigned idx=0; idx<size; idx++)
-        {
-            if(tag[idx] != buffer[idx])
-            {
-                return false;
-            }
-        }
-        return true;
-    }
+    static bool Compare(char* buffer, const char* tag, unsigned size);

-    static bool CompareDWords(char* buffer, const char* tag)
-    {
-        return Compare(buffer, tag, sizeof(DWord));
-    }
+    static bool CompareDWords(char* buffer, const char* tag);

-    static bool CompareWords(char* buffer, const char* tag)
-    {
-        return Compare(buffer, tag, sizeof(Word));
-    }
+    static bool CompareWords(char* buffer, const char* tag);

    static const int BYTE_FIRST_BIT = 0x40; // 1000 0000
    static const Word WORD_FIRST_BIT = 0x8000; // 1000 0000 - 0000 0000
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@ -14,6 +14,7 @@ list(APPEND core_HEADERS
 )   

 list(APPEND core_LIB_INCLUDES 
+    ByteUtils.cpp
    Event.cpp
    Dictionary.cpp
    Color.cpp
@ -28,6 +29,7 @@ list(APPEND core_LIB_INCLUDES
    RandomUtils.cpp
    StringUtils.cpp
    streams/BinaryStream.cpp
+    streams/BitStream.cpp
    http/HttpResponse.cpp
    http/HttpHeader.cpp
    http/HttpRequest.cpp
--- a/src/core/streams/BitStream.cpp
+++ b/src/core/streams/BitStream.cpp
@ -0,0 +1,56 @@
+#include "BitStream.h"
+
+#include "ByteUtils.h"
+
+// Advances to the byte following mByteOffset and makes it the current byte.
+// Returns false, leaving the stream untouched, when no such byte exists;
+// this now also covers an empty buffer, which the previous equality test
+// let through to an out-of-bounds read.
+bool BitStream::loadNextByte()
+{
+    // Deliberately computed in unsigned arithmetic.
+    const unsigned next_offset = mByteOffset + 1;
+    if (next_offset >= mBuffer.size())
+    {
+        return false;
+    }
+
+    mByteOffset = next_offset;
+    mCurrentByte = mBuffer[next_offset];
+    return true;
+}
+
+// Reads the next n bits (n <= 8) into the low bits of buffer, least
+// significant bit first, starting at mBitOffset within the current byte and
+// continuing into the next byte when the current one runs out.
+// Returns false without consuming anything when n > 8 or when a further
+// byte is needed but none is left.
+bool BitStream::getNextNBits(unsigned n, unsigned char& buffer)
+{
+    if (n > 8 || mBitOffset > 8)
+    {
+        return false;
+    }
+
+    // Go through unsigned char so a byte with the top bit set never
+    // sign-extends when shifted.
+    const unsigned current = static_cast<unsigned char>(mCurrentByte);
+    const unsigned available = 8 - mBitOffset;
+
+    if (n <= available)
+    {
+        const unsigned mask = (1u << n) - 1u;
+        buffer = static_cast<unsigned char>((current >> mBitOffset) & mask);
+        mBitOffset += n;
+        return true;
+    }
+
+    // Not enough bits left: take what remains of this byte as the low part
+    // and the missing bits from the bottom of the next byte as the high part.
+    const unsigned low_bits = current >> mBitOffset;
+    if (!loadNextByte())
+    {
+        return false;
+    }
+
+    const unsigned overshoot = n - available;
+    const unsigned next = static_cast<unsigned char>(mCurrentByte);
+    const unsigned high_bits = next & ((1u << overshoot) - 1u);
+
+    buffer = static_cast<unsigned char>((high_bits << available) | low_bits);
+    mBitOffset = overshoot;
+    return true;
+}
+
+// Stores data at position idx of the buffer. idx is not range-checked.
+void BitStream::setByte(unsigned idx, unsigned char data)
+{
+    auto& slot = mBuffer[idx];
+    slot = data;
+}
+
+// Replaces the buffer contents with size zero-initialised bytes.
+void BitStream::setBufferSize(std::size_t size)
+{
+    mBuffer.assign(size, static_cast<unsigned char>(0));
+}
--- a/src/core/streams/BitStream.h
+++ b/src/core/streams/BitStream.h
@ -0,0 +1,27 @@
+#pragma once
+
+#include <vector>
+
+// Byte buffer that can be read bit by bit.
+// A byte has to be made current with loadNextByte() before its bits are
+// read; until then the current byte is 0.
+class BitStream
+{
+public:
+    // Reads the next n bits into buffer; returns false if that is not possible.
+    bool getNextNBits(unsigned n, unsigned char& buffer);
+
+    // Makes the next byte of the buffer current; returns false at the end.
+    bool loadNextByte();
+
+    // Stores data at position idx of the buffer.
+    void setByte(unsigned idx, unsigned char data);
+
+    // Resizes the buffer to size bytes.
+    void setBufferSize(std::size_t size);
+
+    // Returns the byte most recently made current.
+    unsigned char getCurrentByte() const
+    {
+        return mCurrentByte;
+    }
+
+private:
+    // Index of the current byte. Starts one position "before" the buffer
+    // (unsigned wrap-around value) so that the first loadNextByte() call
+    // lands on index 0 instead of skipping the first byte.
+    unsigned mByteOffset{static_cast<unsigned>(-1)};
+    unsigned mBitOffset{0};
+
+    // Unsigned to match the buffer element type and getCurrentByte().
+    unsigned char mCurrentByte{0};
+    std::vector<unsigned char> mBuffer;
+};
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -10,6 +10,7 @@ target_include_directories(test_utils PUBLIC
 list(APPEND TestFiles 
            audio/TestAudioWriter.cpp
            audio/TestMidiReader.cpp
+            core/TestByteUtils.cpp
            core/TestBinaryStream.cpp
            core/TestTomlReader.cpp
            compiler/TestLexer.cpp
--- a/test/compression/TestStreamCompressor.cpp
+++ b/test/compression/TestStreamCompressor.cpp
@ -2,6 +2,7 @@

 #include "HuffmanEncoder.h"
 #include "RunLengthEncoder.h"
+#include "Lz77Encoder.h"

 void test_run_length_encoder()
 {
@ -34,13 +35,30 @@ void test_huffman_encoder()

    HuffmanEncoder encoder;
    encoder.encode(counts);
-
 }

+// Smoke test: encodes a fixed sentence with Lz77Encoder and prints the
+// result. The decode round trip is not exercised here.
+void test_lz77_encoder()
+{
+    // A shorter alternative input for debugging is "sir sid eastman".
+    const std::string sentence{"sir sid eastman easily teases sea sick seals"};
+
+    Lz77Encoder lz77;
+    const std::string compressed = lz77.encode(sentence);
+
+    std::cout << "Encoded: " << compressed << std::endl;
+}
+
+
 int main()
 {
-    test_huffman_encoder();
+    //test_huffman_encoder();

    //test_run_length_encoder();
+
+    test_lz77_encoder();
    return 0;
 }
--- a/test/core/TestByteUtils.cpp
+++ b/test/core/TestByteUtils.cpp
@ -0,0 +1,18 @@
+#include "ByteUtils.h"
+
+#include <iostream>
+
+// Prints a byte parsed from a bit string, its string form, and a 3-bit slice
+// taken from bit 3, for manual inspection of the ByteUtils helpers.
+int main()
+{
+    const unsigned char sample = ByteUtils::getFromString("00110101");
+    std::cout << "Value is " << static_cast<unsigned>(sample) << std::endl;
+
+    const auto sample_bits = ByteUtils::toString(sample);
+    std::cout << "String rep is " << sample_bits << std::endl;
+
+    const unsigned char middle = ByteUtils::getMBitsAtN(sample, 3, 3);
+    std::cout << "Slice is " << ByteUtils::toString(middle) << std::endl;
+
+    return 0;
+}
--- a/test/core/TestTomlReader.cpp
+++ b/test/core/TestTomlReader.cpp
@ -5,17 +5,17 @@

 int main()
 {
-	const auto data_loc = std::filesystem::path(__FILE__) / "../../data";
-	const auto sample_toml_file = data_loc / "sample_toml.toml";
+    const auto data_loc = std::filesystem::path(__FILE__) / "../../data";
+    const auto sample_toml_file = data_loc / "sample_toml.toml";

-	auto reader = TomlReader();
-	reader.read(sample_toml_file);
+    auto reader = TomlReader();
+    reader.read(sample_toml_file);

-	auto themes_table = reader.getContent()->getTable("themes");
-	for (const auto& items : themes_table->getKeyValuePairs())
-	{
-		std::cout << "Got entry with key: " << items.first << " and val " << items.second << std::endl;
-	}
+    auto themes_table = reader.getContent()->getTable("themes");
+    for (const auto& items : themes_table->getKeyValuePairs())
+    {
+        std::cout << "Got entry with key: " << items.first << " and val " << items.second << std::endl;
+    }

-	return 0;
-}
+    return 0;
+}