Some markdown processing.

This commit is contained in:
James Grogan 2022-12-01 17:13:54 +00:00
parent 31b479e9f6
commit ec11529b9a
23 changed files with 677 additions and 135 deletions

View file

@ -1,8 +1,15 @@
#include "DocumentConverter.h" #include "DocumentConverter.h"
#include "MarkdownParser.h" #include "MarkdownParser.h"
#include "MarkdownDocument.h"
#include "MarkdownConverter.h"
#include "HtmlDocument.h"
#include "HtmlWriter.h" #include "HtmlWriter.h"
#include "FileLogger.h" #include "FileLogger.h"
#include "File.h" #include "File.h"
#include <fstream> #include <fstream>
DocumentConverter::DocumentConverter() DocumentConverter::DocumentConverter()
@ -55,19 +62,14 @@ void DocumentConverter::markdownToHtml(File* input, File* output)
input->open(File::AccessMode::Read); input->open(File::AccessMode::Read);
MarkdownParser parser; MarkdownParser parser;
auto md_doc = parser.run(input->readText());
auto handle = input->getInHandle();
while(handle->good())
{
std::string line;
std::getline(*handle, line);
parser.processLine(line);
};
input->close(); input->close();
auto html_document = parser.getHtml(); MarkdownConverter converter;
auto html_doc = converter.convert(md_doc.get());
HtmlWriter writer; HtmlWriter writer;
std::string html_string = writer.toString(html_document.get()); std::string html_string = writer.toString(html_doc.get());
output->open(File::AccessMode::Write); output->open(File::AccessMode::Write);
*(output->getOutHandle()) << html_string; *(output->getOutHandle()) << html_string;

View file

@ -12,6 +12,9 @@ list(APPEND web_LIB_INCLUDES
xml/xml-elements/XmlProlog.h xml/xml-elements/XmlProlog.h
xml/xml-elements/XmlProlog.cpp xml/xml-elements/XmlProlog.cpp
markdown/MarkdownParser.cpp markdown/MarkdownParser.cpp
markdown/MarkdownConverter.cpp
markdown/MarkdownDocument.h
markdown/MarkdownDocument.cpp
html/HtmlWriter.cpp html/HtmlWriter.cpp
html/HtmlDocument.cpp html/HtmlDocument.cpp
html/HtmlElement.cpp html/HtmlElement.cpp

View file

@ -23,3 +23,11 @@ std::unique_ptr<HtmlDocument> HtmlDocument::Create()
{ {
return std::make_unique<HtmlDocument>(); return std::make_unique<HtmlDocument>();
} }
/**
 * Appends an element as the last child of the document's "body" element.
 *
 * The element is discarded (destroyed when this function returns) if the
 * document has no root element or the root has no child tagged "body".
 *
 * @param element Element to hand over to the body; ownership is transferred.
 */
void HtmlDocument::addElementToBody(std::unique_ptr<HtmlElement> element)
{
    // The writers treat a missing root as a valid state, so guard it here too
    // instead of dereferencing a null pointer.
    auto root = getRoot();
    if (!root)
    {
        return;
    }

    if (auto body_element = root->getFirstChildWithTagName("body"))
    {
        body_element->addChild(std::move(element));
    }
}

View file

@ -4,6 +4,8 @@
#include <memory> #include <memory>
class HtmlElement;
class HtmlDocument : public XmlDocument class HtmlDocument : public XmlDocument
{ {
public: public:
@ -12,6 +14,10 @@ public:
virtual ~HtmlDocument() = default; virtual ~HtmlDocument() = default;
static std::unique_ptr<HtmlDocument> Create(); static std::unique_ptr<HtmlDocument> Create();
void addElementToBody(std::unique_ptr<HtmlElement> element);
private:
}; };
using HtmlDocumentPtr = std::unique_ptr<HtmlDocument>; using HtmlDocumentPtr = std::unique_ptr<HtmlDocument>;

View file

@ -5,8 +5,3 @@ HtmlElement::HtmlElement(const std::string& tagName)
{ {
} }
std::unique_ptr<HtmlElement> HtmlElement::CreateUnique(const std::string& tagName)
{
return std::make_unique<HtmlElement>(tagName);
}

View file

@ -6,9 +6,48 @@
class HtmlElement : public XmlElement class HtmlElement : public XmlElement
{ {
public: public:
enum class Type
{
NONE,
BODY,
HEAD,
PARAGRAPH,
TEXT_RUN,
CODE,
HEADING
};
HtmlElement(const std::string& tagName); HtmlElement(const std::string& tagName);
static std::unique_ptr<HtmlElement> CreateUnique(const std::string& tagName); virtual Type getType() const = 0;
};
/// Element rendered with the tag name "code".
class HtmlCodeElement : public HtmlElement
{
public:
    HtmlCodeElement()
        : HtmlElement("code")
    {
    }

    /// Identifies this element as Type::CODE.
    Type getType() const override { return Type::CODE; }
};
/// Heading element whose tag name is "h" followed by the heading level.
class HtmlHeadingElement : public HtmlElement
{
public:
    /**
     * @param index Requested heading level. HTML only defines h1..h6, so
     *              values outside that range are clamped to the nearest
     *              valid level rather than producing a tag such as "h0"
     *              or "h7".
     */
    explicit HtmlHeadingElement(unsigned index)
        : HtmlElement("h" + std::to_string(clampLevel(index)))
    {
    }

    /// Identifies this element as Type::HEADING.
    Type getType() const override
    {
        return Type::HEADING;
    }

private:
    /// Restricts a heading level to the inclusive range [1, 6].
    static unsigned clampLevel(unsigned level)
    {
        if (level < 1)
        {
            return 1;
        }
        return level > 6 ? 6 : level;
    }
};
using HtmlElementUPtr = std::unique_ptr<HtmlElement>; using HtmlElementUPtr = std::unique_ptr<HtmlElement>;

View file

@ -0,0 +1,23 @@
#pragma once
#include "HtmlElement.h"
/// Element that serialises as its bare text, without any surrounding tag.
class HtmlTextRun : public HtmlElement
{
public:
    HtmlTextRun()
        : HtmlElement("NONE_HtmlTextRun")
    {
    }

    /// Identifies this element as Type::TEXT_RUN.
    Type getType() const override { return Type::TEXT_RUN; }

    /// Returns the element text preceded by two spaces per depth level.
    std::string toString(unsigned depth = 0) const override
    {
        std::string rendered(2 * depth, ' ');
        rendered += getText();
        return rendered;
    }
};

View file

@ -11,57 +11,12 @@ HtmlWriter::HtmlWriter()
} }
/**
 * Serialises an element and, recursively, its children.
 *
 * Each nesting level is indented by two spaces. An element with neither text
 * nor children is written in self-closing form.
 *
 * @param element Element to serialise.
 * @param depth   Nesting depth used to compute the indentation.
 * @return The serialised markup, terminated by a newline.
 */
std::string HtmlWriter::toString(XmlElement* element, unsigned depth)
{
    const std::string indent(2 * depth, ' ');

    std::string markup = indent + "<" + element->getTagName();
    const auto attribute_count = element->getNumAttributes();
    for (std::size_t pos = 0; pos < attribute_count; ++pos)
    {
        const auto attr = element->getAttribute(pos);
        markup += " " + attr->getName() + "=\"" + attr->getValue() + "\"";
    }

    const bool has_children = element->getNumChildren() > 0;
    const bool has_text = !element->getText().empty();
    if (!has_children && !has_text)
    {
        markup += "/>\n";
        return markup;
    }

    markup += ">";
    if (has_text)
    {
        markup += element->getText();
    }

    if (has_children)
    {
        // Children go on their own lines; the closing tag is re-indented.
        markup += "\n";
        const auto child_count = element->getNumChildren();
        for (std::size_t pos = 0; pos < child_count; ++pos)
        {
            markup += toString(element->getChild(pos), depth + 1);
        }
        markup += indent;
    }

    markup += "</" + element->getTagName() + ">\n";
    return markup;
}
std::string HtmlWriter::toString(HtmlDocument* document) std::string HtmlWriter::toString(HtmlDocument* document)
{ {
std::string content = "<!DOCTYPE html>\n"; std::string content = "<!DOCTYPE html>\n";
if (auto root = document->getRoot()) if (auto root = document->getRoot())
{ {
content += toString(root); content += root->toString();
} }
return content; return content;
} }

View file

@ -3,7 +3,6 @@
#include <string> #include <string>
class HtmlDocument; class HtmlDocument;
class XmlElement;
class HtmlWriter class HtmlWriter
{ {
@ -11,7 +10,4 @@ public:
HtmlWriter(); HtmlWriter();
std::string toString(HtmlDocument* document); std::string toString(HtmlDocument* document);
private:
std::string toString(XmlElement* element, unsigned depth=0);
}; };

View file

@ -9,4 +9,9 @@ public:
{ {
} }
/// Identifies this element as Type::BODY.
Type getType() const override { return Type::BODY; }
}; };

View file

@ -9,4 +9,9 @@ public:
{ {
} }
/// Identifies this element as Type::HEAD.
Type getType() const override { return Type::HEAD; }
}; };

View file

@ -0,0 +1,18 @@
#pragma once
#include "HtmlElement.h"
/// Element rendered with the tag name "p".
class HtmlParagraphElement : public HtmlElement
{
public:
    HtmlParagraphElement()
        : HtmlElement("p")
    {
    }

    /// Identifies this element as Type::PARAGRAPH.
    Type getType() const override { return Type::PARAGRAPH; }
};

View file

@ -0,0 +1,59 @@
#include "MarkdownConverter.h"
#include "HtmlDocument.h"
#include "HtmlElement.h"
#include "HtmlParagraphElement.h"
#include "HtmlTextRun.h"
#include "MarkdownDocument.h"
/**
 * Builds an HTML document from a parsed Markdown document.
 *
 * Top-level elements are translated in order:
 *  - HEADING         -> HtmlHeadingElement at the heading's level
 *  - PARAGRAPH       -> HtmlParagraphElement whose children are
 *                       HtmlCodeElement (INLINE_QUOTE) or HtmlTextRun (TEXT_SPAN)
 *  - MULTILINE_QUOTE -> HtmlCodeElement
 * Elements and paragraph children of any other type are skipped.
 *
 * @param markdownDoc Document to convert; a null pointer yields a document
 *                    with nothing added to it.
 * @return The newly created HTML document.
 */
std::unique_ptr<HtmlDocument> MarkdownConverter::convert(MarkdownDocument* markdownDoc) const
{
    auto html_doc = std::make_unique<HtmlDocument>();
    if (!markdownDoc)
    {
        return html_doc;
    }

    for (std::size_t idx = 0; idx < markdownDoc->getNumElements(); idx++)
    {
        auto md_element = markdownDoc->getElement(idx);
        if (!md_element)
        {
            continue;
        }

        const auto element_type = md_element->getType();
        if (element_type == MarkdownElement::Type::HEADING)
        {
            // The type tag and the dynamic type should agree; skip the element
            // rather than dereference a failed cast if they ever do not.
            auto heading = dynamic_cast<MarkdownHeading*>(md_element);
            if (!heading)
            {
                continue;
            }
            auto html_element = std::make_unique<HtmlHeadingElement>(heading->getLevel());
            html_element->setText(heading->getTextContent());
            html_doc->addElementToBody(std::move(html_element));
        }
        else if (element_type == MarkdownElement::Type::PARAGRAPH)
        {
            auto para_element = dynamic_cast<MarkdownParagraph*>(md_element);
            if (!para_element)
            {
                continue;
            }

            auto html_p_element = std::make_unique<HtmlParagraphElement>();
            for (std::size_t child_idx = 0; child_idx < para_element->getNumChildren(); child_idx++)
            {
                auto child = para_element->getChild(child_idx);
                if (!child)
                {
                    continue;
                }

                if (child->getType() == MarkdownElement::Type::INLINE_QUOTE)
                {
                    auto html_quote = std::make_unique<HtmlCodeElement>();
                    html_quote->setText(child->getTextContent());
                    html_p_element->addChild(std::move(html_quote));
                }
                else if (child->getType() == MarkdownElement::Type::TEXT_SPAN)
                {
                    auto html_text = std::make_unique<HtmlTextRun>();
                    html_text->setText(child->getTextContent());
                    html_p_element->addChild(std::move(html_text));
                }
            }
            html_doc->addElementToBody(std::move(html_p_element));
        }
        else if (element_type == MarkdownElement::Type::MULTILINE_QUOTE)
        {
            auto html_quote = std::make_unique<HtmlCodeElement>();
            html_quote->setText(md_element->getTextContent());
            html_doc->addElementToBody(std::move(html_quote));
        }
    }

    // Returning the local by name lets the compiler elide or move it;
    // wrapping it in std::move would disable copy elision.
    return html_doc;
}

View file

@ -0,0 +1,13 @@
#pragma once
#include <memory>
class HtmlDocument;
class MarkdownDocument;
// Translates a parsed MarkdownDocument into an HtmlDocument.
class MarkdownConverter
{
public:
// Builds and returns a new HtmlDocument from the elements of markdownDoc.
// The input document is only read through the pointer; the caller keeps
// ownership of it.
std::unique_ptr<HtmlDocument> convert(MarkdownDocument* markdownDoc) const;
};

View file

View file

@ -0,0 +1,167 @@
#pragma once
#include <cstddef>
#include <memory>
#include <string>
#include <vector>
/// Abstract base for every node of a Markdown document; carries the node's
/// accumulated text.
class MarkdownElement
{
public:
    /// Discriminator returned by getType().
    enum class Type
    {
        HEADING,
        PARAGRAPH,
        TEXT_SPAN,
        INLINE_CODE,
        MULTILINE_CODE,
        INLINE_QUOTE,
        MULTILINE_QUOTE,
        INLINE_SPECIAL,
        MULTILINE_SPECIAL,
        LINK,
        IMAGE
    };

    virtual ~MarkdownElement() = default;

    /// Concrete kind of this element.
    virtual Type getType() const = 0;

    /// Text accumulated so far through appendTextContent().
    const std::string& getTextContent() const { return mTextContent; }

    /// Appends content to the end of the element's text.
    void appendTextContent(const std::string& content) { mTextContent.append(content); }

private:
    std::string mTextContent;
};
class MarkdownInlineElement : public MarkdownElement
{
public:
virtual ~MarkdownInlineElement() = default;
};
class MarkdownTextSpan : public MarkdownInlineElement
{
public:
virtual ~MarkdownTextSpan() = default;
Type getType() const override
{
return Type::TEXT_SPAN;
}
};
class MarkdownParagraph : public MarkdownElement
{
public:
virtual ~MarkdownParagraph() = default;
Type getType() const override
{
return Type::PARAGRAPH;
}
void addChild(std::unique_ptr<MarkdownInlineElement> child)
{
mChildren.push_back(std::move(child));
}
std::size_t getNumChildren() const
{
return mChildren.size();
}
MarkdownInlineElement* getChild(std::size_t idx) const
{
return mChildren[idx].get();
}
std::vector<std::unique_ptr<MarkdownInlineElement> > mChildren;
};
/// Block element reporting Type::HEADING and carrying a heading level.
class MarkdownHeading : public MarkdownElement
{
public:
    /// @param level Heading level to store. The constructor is explicit so
    ///              that a bare integer cannot silently convert to a heading.
    explicit MarkdownHeading(unsigned level)
        : mLevel(level)
    {
    }

    virtual ~MarkdownHeading() = default;

    Type getType() const override
    {
        return Type::HEADING;
    }

    /// Level passed at construction.
    unsigned getLevel() const
    {
        return mLevel;
    }

private:
    unsigned mLevel{1};
};
class MarkdownInlineQuote : public MarkdownInlineElement
{
public:
virtual ~MarkdownInlineQuote() = default;
Type getType() const override
{
return Type::INLINE_QUOTE;
}
};
/// Block element reporting Type::MULTILINE_QUOTE, annotated with a tag string.
class MarkdownMultilineQuote : public MarkdownElement
{
public:
    /// @param tag Tag to store with the quote. The constructor is explicit so
    ///            that a string cannot silently convert to a quote element.
    explicit MarkdownMultilineQuote(const std::string& tag)
        : mTag(tag)
    {
    }

    virtual ~MarkdownMultilineQuote() = default;

    Type getType() const override
    {
        return Type::MULTILINE_QUOTE;
    }

    /// Tag passed at construction; previously stored without any way to read it.
    const std::string& getTag() const
    {
        return mTag;
    }

private:
    std::string mTag;
};
class MarkdownDocument
{
public:
void addElement(std::unique_ptr<MarkdownElement> element)
{
mElements.push_back(std::move(element));
}
std::size_t getNumElements() const
{
return mElements.size();
}
MarkdownElement* getElement(std::size_t idx) const
{
return mElements[idx].get();
}
private:
std::vector<std::unique_ptr<MarkdownElement> > mElements;
};

View file

@ -1,30 +1,231 @@
#include "MarkdownParser.h" #include "MarkdownParser.h"
#include "MarkdownDocument.h"
#include <sstream> #include <sstream>
#include <iostream> #include <iostream>
MarkdownParser::MarkdownParser() MarkdownParser::MarkdownParser()
: mHtmlDocument(HtmlDocument::Create())
{ {
} }
void MarkdownParser::processLine(const std::string& line) MarkdownParser::~MarkdownParser()
{ {
} }
void MarkdownParser::run(const std::string& content) void MarkdownParser::onMultilineQuote()
{ {
std::stringstream ss(content); std::cout << "Adding multiline quote " << mDocumentContent << std::endl;
std::string line; auto quote = std::make_unique<MarkdownMultilineQuote>(mMultilineTag);
while (std::getline(ss, line, '\n')) quote->appendTextContent(mDocumentContent);
mDocumentContent.clear();
mDocumentState = DocumentState::NONE;
mMarkdownDocument->addElement(std::move(quote));
onNewParagraph();
}
void MarkdownParser::onInlineQuote()
{
std::cout << "Adding inline quote " << mLineContent << std::endl;
auto quote = std::make_unique<MarkdownInlineQuote>();
quote->appendTextContent(mLineContent);
mLineContent.clear();
mLineState = LineState::NONE;
if(mWorkingParagraph)
{ {
processLine(line); mWorkingParagraph->addChild(std::move(quote));
} }
} }
HtmlDocumentPtr MarkdownParser::getHtml() void MarkdownParser::onHeading(unsigned level)
{ {
return std::move(mHtmlDocument); std::cout << "Adding heading: " << mLineContent << std::endl;
auto heading = std::make_unique<MarkdownHeading>(level);
heading->appendTextContent(mLineContent);
mMarkdownDocument->addElement(std::move(heading));
}
/**
 * Closes the working paragraph and starts a fresh one.
 *
 * If a working paragraph exists, pending text is flushed into it through
 * onTextSpan(); the paragraph is then moved into the document only when it
 * has at least one child. A new empty working paragraph is always created.
 */
void MarkdownParser::onNewParagraph()
{
    if (mWorkingParagraph)
    {
        onTextSpan();

        // Written as an explicit comparison: the previous `!count == 0`
        // negated the count before comparing and only worked by accident.
        if (mWorkingParagraph->getNumChildren() != 0)
        {
            std::cout << "Adding para to document" << std::endl;
            mMarkdownDocument->addElement(std::move(mWorkingParagraph));
        }
    }
    mWorkingParagraph = std::make_unique<MarkdownParagraph>();
}
/**
 * Emits the pending document text as a text span of the working paragraph.
 *
 * The per-line buffer is always cleared. Nothing else happens when there is
 * no working paragraph or no pending document text.
 */
void MarkdownParser::onTextSpan()
{
    mLineContent.clear();

    if (!mWorkingParagraph || mDocumentContent.empty())
    {
        return;
    }

    std::cout << "Adding text " << mDocumentContent << std::endl;
    auto span = std::make_unique<MarkdownTextSpan>();
    span->appendTextContent(mDocumentContent);
    mWorkingParagraph->addChild(std::move(span));
    mDocumentContent.clear();
}
std::pair<unsigned, bool> MarkdownParser::onTick(unsigned tickCount)
{
unsigned new_tick_count = tickCount;
bool stop_line_processing = false;
if (tickCount == 2)
{
if (mDocumentState == DocumentState::IN_MULTILINEQUOTE)
{
onMultilineQuote();
stop_line_processing = true;
}
else
{
onNewParagraph();
mLineState = LineState::IN_MULTILINE_TAG;
new_tick_count = 0;
mDocumentState = DocumentState::IN_MULTILINEQUOTE;
}
}
else if(mLineState == LineState::IN_INLINEQUOTE)
{
if (mLineContent.empty())
{
mLineState = LineState::NONE;
new_tick_count++;
}
else
{
new_tick_count = 0;
onInlineQuote();
}
}
else if(mDocumentState == DocumentState::IN_MULTILINEQUOTE)
{
new_tick_count++;
mLineContent += '`';
}
else
{
new_tick_count++;
mLineState = LineState::IN_INLINEQUOTE;
}
return {new_tick_count, stop_line_processing};
}
// Processes the line held in mWorkingLine one character at a time, updating
// the line/document state and forwarding completed pieces to the on* handlers.
void MarkdownParser::processLine()
{
// Every line starts with an empty line buffer and a neutral line state.
mLineContent.clear();
mLineState = LineState::NONE;
unsigned heading_level{0};
unsigned tick_count{0};
// Set once the text preceding an inline quote has been flushed on this line.
bool flushed_pre_inline = false;
for(auto c : mWorkingLine)
{
if (c == '`')
{
// Backticks are handled by onTick, which returns the updated tick count and
// whether the rest of the line must be skipped.
auto [ret_tick_count, stop_line_processing] = onTick(tick_count);
tick_count = ret_tick_count;
if(stop_line_processing)
{
return;
}
}
else
{
if (mLineState == LineState::IN_INLINEQUOTE)
{
if (!flushed_pre_inline)
{
// First quoted character on this line: move the text collected so far into
// the document buffer and emit it as a text span (onTextSpan also clears
// mLineContent), so the quote starts from an empty buffer.
// NOTE(review): the flag is never reset, so text between two inline quotes
// on the same line appears to end up inside the second quote - confirm.
std::cout << "Flushing pre-line " << std::endl;
mDocumentContent += mLineContent;
onTextSpan();
flushed_pre_inline = true;
}
mLineContent += c;
}
else if (mDocumentState == DocumentState::IN_MULTILINEQUOTE)
{
mLineContent += c;
}
else
{
if (c == '#')
{
// Each '#' outside a quote closes the working paragraph, marks the line as
// a heading and raises the heading level by one.
// NOTE(review): this applies to a '#' at any position in the line, not only
// at the start - confirm that is intended.
onNewParagraph();
mLineState = LineState::IN_HEADING;
heading_level++;
}
else
{
mLineContent += c;
}
}
}
}
// End of line: dispatch on the state the line finished in.
if (mLineState == LineState::IN_HEADING)
{
onHeading(heading_level);
}
else if(mLineState == LineState::IN_MULTILINE_TAG)
{
// The text collected on the line that opened a multiline quote is its tag.
mMultilineTag = mLineContent;
}
else if (mLineState == LineState::IN_INLINEQUOTE)
{
// Inline quote still open at end of line; onTextSpan clears mLineContent.
onTextSpan();
}
else
{
if (mLineContent.size() > 0)
{
// NOTE(review): lines are appended without any separator, so consecutive
// lines are joined directly - confirm whether a space/newline is wanted.
mDocumentContent.append(mLineContent);
}
}
}
/// An empty input line ends the current paragraph.
void MarkdownParser::onEmptyLine() { onNewParagraph(); }
std::unique_ptr<MarkdownDocument> MarkdownParser::run(const std::string& content)
{
mMarkdownDocument = std::make_unique<MarkdownDocument>();
std::stringstream ss(content);
std::string line;
while (std::getline(ss, line, '\n'))
{
if (line.empty())
{
onEmptyLine();
continue;
}
mWorkingLine = line;
processLine();
}
onTextSpan();
onNewParagraph();
return std::move(mMarkdownDocument);
} }

View file

@ -1,29 +1,56 @@
#pragma once #pragma once
#include "HtmlDocument.h" #include <memory>
#include <string>
class MarkdownDocument;
class MarkdownParagraph;
class MarkdownParser class MarkdownParser
{ {
enum class DocumentState enum class DocumentState
{ {
None NONE,
IN_MULTILINEQUOTE
}; };
enum class LineState enum class LineState
{ {
None NONE,
IN_HEADING,
IN_INLINEQUOTE,
IN_MULTILINE_TAG
}; };
public: public:
MarkdownParser(); MarkdownParser();
HtmlDocumentPtr getHtml(); ~MarkdownParser();
void processLine(const std::string& line); std::unique_ptr<MarkdownDocument> run(const std::string& content);
void run(const std::string& content);
private: private:
DocumentState mDocumentState {DocumentState::None}; void processLine();
HtmlDocumentPtr mHtmlDocument;
void onMultilineQuote();
void onInlineQuote();
void onHeading(unsigned level);
void onEmptyLine();
void onNewParagraph();
void onTextSpan();
std::pair<unsigned, bool> onTick(unsigned tickCount);
std::string mWorkingLine;
std::string mLineContent;
std::string mDocumentContent;
std::string mMultilineTag;
LineState mLineState {LineState::NONE};
DocumentState mDocumentState {DocumentState::NONE};
std::unique_ptr<MarkdownParagraph> mWorkingParagraph{nullptr};
std::unique_ptr<MarkdownDocument> mMarkdownDocument;
}; };

View file

@ -3,51 +3,6 @@
#include "XmlDocument.h" #include "XmlDocument.h"
#include "XmlAttribute.h" #include "XmlAttribute.h"
/**
 * Serialises an element and, recursively, its children.
 *
 * Each nesting level is indented by two spaces. An element with neither text
 * nor children is written in self-closing form.
 *
 * @param element Element to serialise.
 * @param depth   Nesting depth used to compute the indentation.
 * @return The serialised markup, terminated by a newline.
 */
std::string XmlWriter::toString(XmlElement* element, unsigned depth)
{
    const std::string indent(2 * depth, ' ');

    std::string markup = indent + "<" + element->getTagName();
    const auto attribute_count = element->getNumAttributes();
    for (std::size_t pos = 0; pos < attribute_count; ++pos)
    {
        const auto attr = element->getAttribute(pos);
        markup += " " + attr->getName() + "=\"" + attr->getValue() + "\"";
    }

    const bool has_children = element->getNumChildren() > 0;
    const bool has_text = !element->getText().empty();
    if (!has_children && !has_text)
    {
        markup += "/>\n";
        return markup;
    }

    markup += ">";
    if (has_text)
    {
        markup += element->getText();
    }

    if (has_children)
    {
        // Children go on their own lines; the closing tag is re-indented.
        markup += "\n";
        const auto child_count = element->getNumChildren();
        for (std::size_t pos = 0; pos < child_count; ++pos)
        {
            markup += toString(element->getChild(pos), depth + 1);
        }
        markup += indent;
    }

    markup += "</" + element->getTagName() + ">\n";
    return markup;
}
std::string XmlWriter::toString(XmlDocument* document) std::string XmlWriter::toString(XmlDocument* document)
{ {
std::string content; std::string content;
@ -64,7 +19,7 @@ std::string XmlWriter::toString(XmlDocument* document)
if (auto root = document->getRoot()) if (auto root = document->getRoot())
{ {
content += toString(root); content += root->toString();
} }
return content; return content;
} }

View file

@ -11,7 +11,4 @@ public:
XmlWriter() = default; XmlWriter() = default;
std::string toString(XmlDocument* document); std::string toString(XmlDocument* document);
private:
std::string toString(XmlElement* element, unsigned depth=0);
}; };

View file

@ -49,6 +49,19 @@ void XmlElement::setText(const std::string& text)
mText = text; mText = text;
} }
/**
 * Finds the first direct child whose tag name equals the given tag.
 *
 * @param tag Tag name to look for.
 * @return Non-owning pointer to the first matching child, or nullptr if no
 *         direct child has that tag name.
 */
XmlElement* XmlElement::getFirstChildWithTagName(const std::string& tag)
{
    XmlElement* match = nullptr;
    for (const auto& candidate : mChildren)
    {
        if (candidate->getTagName() == tag)
        {
            match = candidate.get();
            break;
        }
    }
    return match;
}
XmlAttribute* XmlElement::getAttribute(const std::string& attributeName) const XmlAttribute* XmlElement::getAttribute(const std::string& attributeName) const
{ {
for(const auto& attribute : mAttributes) for(const auto& attribute : mAttributes)
@ -84,3 +97,48 @@ XmlElement* XmlElement::getChild(std::size_t index) const
{ {
return mChildren[index].get(); return mChildren[index].get();
} }
std::string XmlElement::toString(unsigned depth) const
{
const auto prefix = std::string(2*depth, ' ');
auto content = prefix + "<" + getTagName();
for (std::size_t idx=0; idx< getNumAttributes(); idx++)
{
auto attribute = getAttribute(idx);
content += " " + attribute->getName() + "=\"" + attribute->getValue() + "\"";
}
const auto num_children = getNumChildren();
if (num_children == 0 && getText().empty())
{
content += "/>\n";
return content;
}
else
{
content += ">";
}
if (!getText().empty())
{
content += getText();
}
if (num_children>0)
{
content += "\n";
}
for (std::size_t idx=0; idx< getNumChildren(); idx++)
{
auto child = getChild(idx);
content += child->toString(depth+1);
}
if (num_children>0)
{
content += prefix;
}
content += "</" + getTagName() + ">\n";
return content;
}

View file

@ -29,9 +29,13 @@ public:
std::size_t getNumChildren() const; std::size_t getNumChildren() const;
XmlElement* getChild(std::size_t index) const; XmlElement* getChild(std::size_t index) const;
XmlElement* getFirstChildWithTagName(const std::string& tag);
void setText(const std::string& text); void setText(const std::string& text);
void setTagName(const std::string& tagName); void setTagName(const std::string& tagName);
virtual std::string toString(unsigned depth = 0) const;
protected: protected:
std::string mTagName; std::string mTagName;
std::string mText; std::string mText;

View file

@ -1,5 +1,10 @@
#include "MarkdownParser.h" #include "MarkdownParser.h"
#include "File.h" #include "File.h"
#include "HtmlDocument.h"
#include "MarkdownDocument.h"
#include "MarkdownConverter.h"
#include "HtmlWriter.h" #include "HtmlWriter.h"
#include "TestFramework.h" #include "TestFramework.h"
@ -11,9 +16,10 @@ TEST_CASE(TestMarkdownParser, "web")
const auto md_content = md_file.readText(); const auto md_content = md_file.readText();
MarkdownParser parser; MarkdownParser parser;
parser.run(md_content); auto md_doc = parser.run(md_content);
auto html = parser.getHtml(); MarkdownConverter converter;
auto html = converter.convert(md_doc.get());
HtmlWriter writer; HtmlWriter writer;
const auto html_string = writer.toString(html.get()); const auto html_string = writer.toString(html.get());