Improvements for markdown parsing.

This commit is contained in:
James Grogan 2022-12-06 18:02:43 +00:00
parent fc44290e3f
commit 8705859115
40 changed files with 957 additions and 537 deletions

View file

@ -1,15 +1,21 @@
#include "MarkdownParser.h"
#include "MarkdownDocument.h"
#include "StringUtils.h"
#include "MarkdownComponents.h"
#include "Lexer.h"
#include "StringUtils.h"
#include <sstream>
#include <iostream>
static constexpr char MULTILINE_QUOTE_DELIMITER[]{"```"};
static constexpr char HEADING_DELIMITER{'#'};
/// Sets up the custom delimiter tables: "$" for inline spans and "$$" for
/// multi-line blocks.
MarkdownParser::MarkdownParser()
{
    // The two tables are independent; inline first, then multi-line.
    mCustomInlineDelimiters = {{"$"}};
    mCustomMultilineDelimiters = {{"$$"}};
}
MarkdownParser::~MarkdownParser()
@ -17,362 +23,345 @@ MarkdownParser::~MarkdownParser()
}
// NOTE(review): this span looks like a diff rendering whose +/- markers were
// lost. Two signatures are stacked below and two `if` heads follow each other
// further down, so removed and added lines appear interleaved. Presumably
// onMultilineQuote()/onInlineQuote() and the statements touching
// mDocumentContent, mLineContent and mWorkingParagraph are the removed version,
// while isInMultilineBlock() and the mWorkingElement checks are the added
// version. TODO confirm against the repository before editing.
void MarkdownParser::onMultilineQuote()
bool MarkdownParser::isInMultilineBlock() const
{
auto quote = std::make_unique<MarkdownMultilineQuote>(mWorkingTag);
quote->appendTextContent(mDocumentContent);
mDocumentContent.clear();
mWorkingTag.clear();
mDocumentState = DocumentState::NONE;
mMarkdownDocument->addElement(std::move(quote));
onNewParagraph();
}
void MarkdownParser::onInlineQuote()
{
auto quote = std::make_unique<MarkdownInlineQuote>();
quote->appendTextContent(mLineContent);
mLineContent.clear();
mLineState = LineState::NONE;
if(mWorkingParagraph)
if (!mWorkingElement)
{
mWorkingParagraph->addChild(std::move(quote));
return false;
}
// True when the working element's type is MULTILINE_QUOTE or CUSTOM_MULTILINE.
auto working_type = mWorkingElement->getType();
return working_type == MarkdownElement::Type::MULTILINE_QUOTE || working_type == MarkdownElement::Type::CUSTOM_MULTILINE ;
}
// NOTE(review): this span looks like a diff rendering whose +/- markers were
// lost; removed and added lines appear interleaved. Presumably
// onHeading()/onNewParagraph() and the statements touching mWorkingParagraph,
// mWorkingBulletList (via std::move) and mDocumentState are the removed
// version, while checkForLink() is the added version. TODO confirm against the
// repository before editing.
void MarkdownParser::onHeading(unsigned level)
unsigned MarkdownParser::checkForLink(const std::string& lineSection)
{
auto heading = std::make_unique<MarkdownHeading>(level);
heading->appendTextContent(mLineContent);
mMarkdownDocument->addElement(std::move(heading));
}
void MarkdownParser::onNewParagraph()
{
if (mWorkingBulletList)
if (lineSection.empty())
{
mMarkdownDocument->addElement(std::move(mWorkingBulletList));
mWorkingBulletList.reset();
// NOTE(review): `==` here is a comparison whose result is discarded, not an assignment.
mDocumentState == DocumentState::NONE;
return 0;
}
else if (mWorkingParagraph)
{
onTextSpan();
// NOTE(review): `!` binds tighter than `==`, so this compares the negated count with 0.
if (!mWorkingParagraph->getNumChildren() == 0)
std::vector<std::string> hits;
unsigned hit_size{0};
// Pattern "[@](@)" with '@' as the placeholder character; two hits are expected (tag, target).
if (Lexer::matchPattern("[@](@)", lineSection, '@', hits))
{
if (hits.size() == 2)
{
mMarkdownDocument->addElement(std::move(mWorkingParagraph));
auto tag = hits[0];
auto target = hits[1];
onTextSpanFinished();
auto element = std::make_unique<MarkdownLink>(target);
element->appendTextContent(tag);
addChildToWorkingElement(std::move(element));
// 4 accounts for the literal characters '[', ']', '(' and ')' of the pattern.
hit_size = 4 + tag.size() + target.size();
}
}
mWorkingParagraph = std::make_unique<MarkdownParagraph>();
return hit_size;
}
// NOTE(review): this span looks like a diff rendering whose +/- markers were
// lost; removed and added lines appear interleaved. Presumably onTextSpan()
// and the statements touching mLineContent, mDocumentContent and
// mWorkingParagraph are the removed version, while checkForImage() is the
// added version. TODO confirm against the repository before editing.
void MarkdownParser::onTextSpan()
unsigned MarkdownParser::checkForImage(const std::string& lineSection)
{
mLineContent.clear();
if(mWorkingParagraph && !mDocumentContent.empty())
if (lineSection.empty())
{
auto text_span = std::make_unique<MarkdownTextSpan>();
text_span->appendTextContent(mDocumentContent);
mWorkingParagraph->addChild(std::move(text_span));
mDocumentContent.clear();
return 0;
}
std::vector<std::string> hits;
unsigned hit_size{0};
// Pattern "![@](@)" with '@' as the placeholder character; two hits are expected (alt, source).
if (Lexer::matchPattern("![@](@)", lineSection, '@', hits))
{
if (hits.size() == 2)
{
auto alt = hits[0];
auto source = hits[1];
onTextSpanFinished();
auto element = std::make_unique<MarkdownImage>(source, alt);
addChildToWorkingElement(std::move(element));
// 5 accounts for the literal characters '!', '[', ']', '(' and ')' of the pattern.
hit_size = 5 + alt.size() + source.size();
}
}
return hit_size;
}
// NOTE(review): this span looks like a diff rendering whose +/- markers were
// lost; removed and added lines appear interleaved. Presumably onTick() and
// the statements touching tickCount, new_tick_count, stop_line_processing and
// mDocumentState are the removed version, while checkForInlineQuote() is the
// added version. TODO confirm against the repository before editing.
std::pair<unsigned, bool> MarkdownParser::onTick(unsigned tickCount)
unsigned MarkdownParser::checkForInlineQuote(const std::string& lineSection)
{
unsigned new_tick_count = tickCount;
bool stop_line_processing = false;
if (tickCount == 2)
if (lineSection.empty())
{
if (mDocumentState == DocumentState::IN_MULTILINEQUOTE)
return 0;
}
std::vector<std::string> hits;
unsigned hit_size{0};
// Pattern "`@`" with '@' as the placeholder character; a single hit (the quoted text) is expected.
if (Lexer::matchPattern("`@`", lineSection, '@', hits))
{
if (hits.size() == 1)
{
onMultilineQuote();
stop_line_processing = true;
auto content = hits[0];
onTextSpanFinished();
auto element = std::make_unique<MarkdownInlineQuote>();
element->appendTextContent(content);
addChildToWorkingElement(std::move(element));
// 2 accounts for the opening and closing backtick.
hit_size = 2 + content.size();
}
}
return hit_size;
}
/// Tries every entry of mCustomInlineDelimiters against lineSection, using the
/// pattern "<delimiter>@<delimiter>" with '@' handed to Lexer::matchPattern as
/// the placeholder character.
///
/// For the first delimiter that matches with exactly one hit, the pending text
/// is flushed via onTextSpanFinished() and a MarkdownCustomInline carrying the
/// hit as its text content is appended to the working element.
///
/// @param lineSection Remaining part of the line to examine.
/// @return 2 * delimiter length + content length for the accepted match, or 0
///         when lineSection is empty or no delimiter produced a single hit.
unsigned MarkdownParser::checkForCustomInline(const std::string& lineSection)
{
    if (lineSection.empty())
    {
        return 0;
    }

    // One hit buffer is shared by all attempts, as handed to matchPattern.
    std::vector<std::string> matches;
    unsigned slot{0};
    while (slot < mCustomInlineDelimiters.size())
    {
        const auto& delimiter = mCustomInlineDelimiters[slot];
        ++slot;

        if (!Lexer::matchPattern(delimiter + "@" + delimiter, lineSection, '@', matches) || matches.size() != 1)
        {
            continue;
        }

        auto content = matches[0];
        onTextSpanFinished();

        auto inline_element = std::make_unique<MarkdownCustomInline>(delimiter);
        inline_element->appendTextContent(content);
        addChildToWorkingElement(std::move(inline_element));

        // Opening delimiter + content + closing delimiter.
        return 2*delimiter.size() + content.size();
    }
    return 0;
}
// Flushes the text accumulated in mWorkingLine: it is appended to
// mWorkingTextSpan when one is set, otherwise a new MarkdownTextSpan is created
// and added to the working element. Afterwards mWorkingLine is cleared and
// mWorkingTextSpan is reset to nullptr. Nothing happens when mWorkingLine is empty.
//
// NOTE(review): the four statements at the top of the `else` branch
// (onNewParagraph(), mLineState, new_tick_count, mDocumentState) reference
// names not otherwise used by this function and look like removed lines of a
// diff whose +/- markers were lost. TODO confirm against the repository.
void MarkdownParser::onTextSpanFinished()
{
if (!mWorkingLine.empty())
{
if (mWorkingTextSpan)
{
std::cout << "Adding to existing text span: " << std::endl;
mWorkingTextSpan->appendTextContent(mWorkingLine);
}
else
{
onNewParagraph();
mLineState = LineState::IN_MULTILINE_TAG;
new_tick_count = 0;
mDocumentState = DocumentState::IN_MULTILINEQUOTE;
std::cout << "Adding new text span: " << mWorkingLine << std::endl;
auto text_span = std::make_unique<MarkdownTextSpan>();
text_span->addLine(mWorkingLine);
// Keep a non-owning pointer before ownership moves to the working element.
mWorkingTextSpan = text_span.get();
addChildToWorkingElement(std::move(text_span));
}
mWorkingLine.clear();
mWorkingTextSpan = nullptr;
}
}
/// Appends an inline child to the element currently being built.
///
/// mWorkingElement is downcast to MarkdownElementWithChildren. When there is no
/// working element, or it is not a MarkdownElementWithChildren, the cast yields
/// nullptr; the child is then discarded (destroyed with the unique_ptr
/// parameter) instead of dereferencing a null pointer.
///
/// @param child Inline element whose ownership is handed to the working element.
void MarkdownParser::addChildToWorkingElement(std::unique_ptr<MarkdownInlineElement> child)
{
    auto* parent = dynamic_cast<MarkdownElementWithChildren*>(mWorkingElement);
    if (!parent)
    {
        return;
    }
    parent->addChild(std::move(child));
}
// Processes one line of input. While inside a multi-line block the line is
// added verbatim to the working element. Otherwise a paragraph is created when
// there is no working element, and the line is scanned from left to right:
// at each position checkForImage, checkForLink, checkForInlineQuote and
// checkForCustomInline are tried in that order and the position advances by
// the length they report; when none matches, one character is appended to
// mWorkingLine. onTextSpanFinished() flushes the remaining text at the end.
//
// NOTE(review): this span looks like a diff rendering whose +/- markers were
// lost. The statements using mLineState, mLineContent, mDocumentContent,
// mWorkingParagraph, new_tick_count and the embedded onLink()/onImage()/
// onBulletItem()/processLine() definitions appear to be removed lines
// interleaved with the added implementation, so the text below is not
// well-formed C++ as it stands. TODO confirm against the repository before editing.
void MarkdownParser::processLine(const std::string& line)
{
if (isInMultilineBlock())
{
mWorkingElement->addLine(line);
return;
}
if (!mWorkingElement)
{
std::cout << "Adding new paragraph " << std::endl;
auto paragraph = std::make_unique<MarkdownParagraph>();
mWorkingElement = paragraph.get();
mMarkdownDocument->addElement(std::move(paragraph));
}
// For a paragraph, continue with its last child as the working text span if it has one.
if (mWorkingElement && mWorkingElement->getType() == MarkdownElement::Type::PARAGRAPH)
{
if (auto last_text_span = dynamic_cast<MarkdownParagraph*>(mWorkingElement)->getLastChild())
{
mWorkingTextSpan = last_text_span;
}
}
else if(mLineState == LineState::IN_INLINEQUOTE)
unsigned line_position = 0;
mWorkingLine.clear();
while(line_position < line.size())
{
if (mLineContent.empty())
const auto remaining = line.substr(line_position, line.size() - line_position);
if(auto length = checkForImage(remaining))
{
mLineState = LineState::NONE;
new_tick_count++;
line_position += length;
}
else if(auto length = checkForLink(remaining))
{
line_position += length;
}
else if(auto length = checkForInlineQuote(remaining))
{
line_position += length;
}
else if(auto length = checkForCustomInline(remaining))
{
line_position += length;
}
else
{
new_tick_count = 0;
onInlineQuote();
}
}
else if(mDocumentState == DocumentState::IN_MULTILINEQUOTE)
{
new_tick_count++;
mLineContent += '`';
}
else
{
new_tick_count++;
mLineState = LineState::IN_INLINEQUOTE;
}
return {new_tick_count, stop_line_processing};
}
void MarkdownParser::onLink()
{
auto element = std::make_unique<MarkdownLink>(mLineContent);
mLineContent.clear();
element->appendTextContent(mWorkingTag);
mWorkingTag.clear();
if (mWorkingParagraph)
{
mWorkingParagraph->addChild(std::move(element));
}
mLineState = LineState::NONE;
}
void MarkdownParser::onImage()
{
auto element = std::make_unique<MarkdownImage>(mLineContent, mWorkingTag);
mLineContent.clear();
element->appendTextContent(mWorkingTag);
mWorkingTag.clear();
if (mWorkingParagraph)
{
mWorkingParagraph->addChild(std::move(element));
}
mLineState = LineState::NONE;
}
void MarkdownParser::onBulletItem()
{
if (!mWorkingBulletList)
{
mWorkingBulletList = std::make_unique<MarkdownBulletList>();
mDocumentState == DocumentState::IN_BULLETS;
}
auto item = std::make_unique<MarkdownBulletItem>();
item->appendTextContent(mLineContent);
mLineContent.clear();
mWorkingBulletList->addChild(std::move(item));
}
void MarkdownParser::processLine()
{
mLineContent.clear();
mLineState = LineState::NONE;
unsigned heading_level{0};
unsigned tick_count{0};
bool flushed_pre_inline = false;
bool first_nonspace = false;
for(auto c : mWorkingLine)
{
if (!StringUtils::IsSpace(c))
{
if (first_nonspace)
{
first_nonspace = false;
}
else
{
first_nonspace = true;
}
}
else
{
first_nonspace = false;
}
if (c == '`')
{
auto [ret_tick_count, stop_line_processing] = onTick(tick_count);
tick_count = ret_tick_count;
if(stop_line_processing)
{
return;
}
}
else
{
if (mLineState == LineState::IN_INLINEQUOTE)
{
if (!flushed_pre_inline)
{
mDocumentContent += mLineContent;
onTextSpan();
flushed_pre_inline = true;
}
mLineContent += c;
}
else if (mDocumentState == DocumentState::IN_MULTILINEQUOTE)
{
mLineContent += c;
}
else if(mLineState == LineState::IN_LINK_TAG)
{
if (c == ']')
{
mLineState = LineState::AWAITING_LINK_BODY;
}
else
{
mWorkingTag += c;
}
}
else if(mLineState == LineState::AWAITING_LINK_BODY)
{
if (c == '(')
{
mLineState = LineState::IN_LINK_BODY;
}
else
{
mLineContent = '[' + mWorkingTag + ']';
mLineState = LineState::NONE;
}
}
else if(mLineState == LineState::IN_LINK_BODY)
{
if(c==')')
{
onLink();
}
else
{
mLineContent += c;
}
}
else if(mLineState == LineState::AWAITING_IMG_TAG)
{
if (c == '[')
{
mLineState = LineState::IN_IMG_TAG;
}
else
{
mLineContent = "![";
mLineState = LineState::NONE;
}
}
else if(mLineState == LineState::IN_IMG_TAG)
{
if (c == ']')
{
mLineState = LineState::AWAITING_IMG_BODY;
}
else
{
mWorkingTag += c;
}
}
else if(mLineState == LineState::AWAITING_IMG_BODY)
{
if (c == '(')
{
mLineState = LineState::IN_IMG_BODY;
}
else
{
mLineContent = "![" + mWorkingTag + "]";
mWorkingTag.clear();
mLineState = LineState::NONE;
}
}
else if(mLineState == LineState::IN_IMG_BODY)
{
if (c == ')')
{
onImage();
}
else
{
mLineContent += c;
}
}
else
{
if (c == '#')
{
onNewParagraph();
mLineState = LineState::IN_HEADING;
heading_level++;
}
else if(c == '[')
{
mDocumentContent += mLineContent;
onTextSpan();
mLineState = LineState::IN_LINK_TAG;
}
else if(c == '!')
{
mDocumentContent += mLineContent;
onTextSpan();
mLineState = LineState::AWAITING_IMG_TAG;
}
else if(first_nonspace && c == '*')
{
if (!mWorkingBulletList)
{
onNewParagraph();
}
mLineState = LineState::IN_BULLETS;
}
else
{
mLineContent += c;
}
}
}
}
if (mLineState == LineState::IN_HEADING)
{
onHeading(heading_level);
}
else if(mLineState == LineState::IN_MULTILINE_TAG)
{
mWorkingTag = mLineContent;
}
else if (mLineState == LineState::IN_INLINEQUOTE)
{
onTextSpan();
}
else if (mLineState == LineState::IN_BULLETS)
{
onBulletItem();
}
else
{
if (mLineContent.size() > 0)
{
mDocumentContent.append(mLineContent);
// No inline construct matched at this position: keep the character as plain text.
mWorkingLine += line[line_position];
line_position++;
}
}
onTextSpanFinished();
}
// Handles a blank line: outside a multi-line block it ends the current
// section via onSectionFinished(); inside one it leaves the block open.
//
// NOTE(review): the leading onNewParagraph() call looks like a removed line of
// a diff whose +/- markers were lost (the function is only defined among other
// apparently removed code in this file). TODO confirm against the repository.
void MarkdownParser::onEmptyLine()
{
onNewParagraph();
if (!isInMultilineBlock())
{
onSectionFinished();
}
}
/// Reports whether line starts with MULTILINE_QUOTE_DELIMITER ("```").
/// StringUtils::startsWith is called with its third argument set to true
/// (presumably so that leading whitespace is skipped; see StringUtils).
bool MarkdownParser::startsWithMultiLineQuote(const std::string& line) const
{
    return StringUtils::startsWith(line, MULTILINE_QUOTE_DELIMITER, /*ignore_whitespace=*/true);
}
/// Looks for the first entry of mCustomMultilineDelimiters that line starts
/// with (StringUtils::startsWith, third argument true).
///
/// @return The index of that delimiter, or -1 when none matches.
int MarkdownParser::startsWithCustomMultilineBlock(const std::string& line) const
{
    const bool ignore_whitespace{true};
    unsigned slot{0};
    while (slot < mCustomMultilineDelimiters.size())
    {
        if (StringUtils::startsWith(line, mCustomMultilineDelimiters[slot], ignore_whitespace))
        {
            return slot;
        }
        ++slot;
    }
    // No registered delimiter opens this line.
    return -1;
}
/// Reports whether line starts with "#" according to StringUtils::startsWith
/// (third argument true).
bool MarkdownParser::startsWithHeading(const std::string& line) const
{
    const bool ignore_whitespace{true};
    return StringUtils::startsWith(line, "#", ignore_whitespace);
}
/// Reports whether line starts with "*" according to StringUtils::startsWith
/// (third argument true).
bool MarkdownParser::startsWithBulletItem(const std::string& line) const
{
    const bool ignore_whitespace{true};
    return StringUtils::startsWith(line, "*", ignore_whitespace);
}
/// Handles a line that starts with the multi-line quote delimiter.
///
/// - If the working element is a MULTILINE_QUOTE, the delimiter closes it and
///   the section is finished.
/// - If another multi-line block is open, the line is forwarded to
///   processLine().
/// - Otherwise a new MarkdownMultilineQuote is started; its tag is whatever
///   StringUtils::removeUpTo returns for the line and the delimiter.
void MarkdownParser::onFoundMultiLineQuote(const std::string& line)
{
    const bool closes_open_quote =
        mWorkingElement && mWorkingElement->getType() == MarkdownElement::Type::MULTILINE_QUOTE;
    if (closes_open_quote)
    {
        onSectionFinished();
        return;
    }

    if (isInMultilineBlock())
    {
        processLine(line);
        return;
    }

    const auto quote_tag = StringUtils::removeUpTo(line, MULTILINE_QUOTE_DELIMITER);
    auto new_quote = std::make_unique<MarkdownMultilineQuote>(quote_tag);
    // Keep a non-owning pointer; the document takes ownership below.
    mWorkingElement = new_quote.get();
    mMarkdownDocument->addElement(std::move(new_quote));
}
/// Handles a line that starts with one of the custom multi-line delimiters.
///
/// - If the working element is a CUSTOM_MULTILINE block that was opened with
///   the same delimiter slot, the delimiter closes it and the section is
///   finished.
/// - If some other multi-line block is open, the line is forwarded to
///   processLine().
/// - Otherwise a new MarkdownCustomMultiLine block is started and the slot
///   that opened it is remembered in mCustomDelimiterIndex, so that only the
///   matching delimiter closes it again.
///
/// @param line      The full input line.
/// @param blockSlot Index into mCustomMultilineDelimiters of the delimiter found.
void MarkdownParser::onFoundCustomMultiLineBlock(const std::string& line, unsigned blockSlot)
{
    if (mWorkingElement && mWorkingElement->getType() == MarkdownElement::Type::CUSTOM_MULTILINE && blockSlot == mCustomDelimiterIndex)
    {
        onSectionFinished();
    }
    else if(isInMultilineBlock())
    {
        processLine(line);
    }
    else
    {
        const auto& delimiter = mCustomMultilineDelimiters[blockSlot];
        const auto tag = StringUtils::removeUpTo(line, delimiter);
        auto block = std::make_unique<MarkdownCustomMultiLine>(tag, delimiter);

        // Record which delimiter opened the block; the close check above
        // compares against this value.
        mCustomDelimiterIndex = blockSlot;

        // Keep a non-owning pointer; the document takes ownership below.
        mWorkingElement = block.get();
        mMarkdownDocument->addElement(std::move(block));
    }
}
void MarkdownParser::onFoundHeading(const std::string& line)
{
if(isInMultilineBlock())
{
processLine(line);
}
else
{
onSectionFinished();
unsigned level = StringUtils::countFirstConsecutiveHits(line, HEADING_DELIMITER);
auto heading = std::make_unique<MarkdownHeading>(level);
std::string prefix;
for(unsigned idx=0; idx<level; idx++)
{
prefix += HEADING_DELIMITER;
}
heading->appendTextContent(StringUtils::stripSurroundingWhitepsace(StringUtils::removeUpTo(line, prefix)));
mMarkdownDocument->addElement(std::move(heading));
}
}
/// Handles a line that starts with the bullet marker "*".
///
/// Inside a multi-line block the line is forwarded to processLine() unchanged.
/// Otherwise a MarkdownBulletList is started if none is in progress, a new
/// MarkdownBulletItem is appended to it and made the working element, and the
/// rest of the line (StringUtils::removeUpTo with "*") is run through
/// processLine() so the item receives its content.
///
/// The content is processed for every item, not only for the item that opens
/// the list; previously the text of follow-up items was dropped.
void MarkdownParser::onFoundBulletItem(const std::string& line)
{
    if (isInMultilineBlock())
    {
        processLine(line);
        return;
    }

    if (!mWorkingBulletList)
    {
        std::cout << "Starting new bullet list" << std::endl;
        auto bullet_list = std::make_unique<MarkdownBulletList>();
        // Keep a non-owning pointer; the document takes ownership below.
        mWorkingBulletList = bullet_list.get();
        mMarkdownDocument->addElement(std::move(bullet_list));
    }

    auto bullet_item = std::make_unique<MarkdownBulletItem>();
    // The item becomes the working element so inline content lands in it.
    mWorkingElement = bullet_item.get();
    mWorkingBulletList->addChild(std::move(bullet_item));

    processLine(StringUtils::removeUpTo(line, "*"));
}
/// Ends the current section: logs a message and drops the non-owning pointers
/// to the working text span, bullet list and element, so the next line starts
/// from a clean state.
void MarkdownParser::onSectionFinished()
{
    std::cout << "Section is finished" << std::endl;

    // Innermost first; the three resets are independent of one another.
    mWorkingTextSpan = nullptr;
    mWorkingBulletList = nullptr;
    mWorkingElement = nullptr;
}
std::unique_ptr<MarkdownDocument> MarkdownParser::run(const std::string& content)
@ -384,17 +373,39 @@ std::unique_ptr<MarkdownDocument> MarkdownParser::run(const std::string& content
while (std::getline(ss, line, '\n'))
{
if (line.empty())
std::cout << "Processing line " << line << std::endl;
if (StringUtils::isWhitespaceOnly(line))
{
std::cout << "Is whitespace only " << std::endl;
onEmptyLine();
continue;
}
mWorkingLine = line;
processLine();
else if (startsWithMultiLineQuote(line))
{
std::cout << "Found multiline quote" << std::endl;
onFoundMultiLineQuote(line);
}
else if (auto result = startsWithCustomMultilineBlock(line); result >= 0)
{
std::cout << "Found custom multiline" << std::endl;
onFoundCustomMultiLineBlock(line, result);
}
else if (startsWithHeading(line))
{
std::cout << "Found heading" << std::endl;
onFoundHeading(line);
}
else if(startsWithBulletItem(line))
{
std::cout << "Found bulletitem" << std::endl;
onFoundBulletItem(line);
}
else
{
std::cout << "Found nothing - process line" << std::endl;
processLine(line);
}
}
onTextSpan();
onNewParagraph();
return std::move(mMarkdownDocument);
}