Back to TILs

C++ gumbo

Date: 2023-01-31 · Last modified: 2023-02-01

Table of contents

Introduction

/// Recursively prints the element/text structure below `node`.
///
/// First prints "<p> has <N> children:", then one line per direct child:
///   - element children are printed as "<p> -> TAG <name>" and then descended
///     into with the path extended to "<p>/<name>";
///   - text children are printed as "<p> -> TEXT <text>";
///   - children of any other node type are skipped.
///
/// @param node Node whose children are listed. A null pointer, or a node that
///             does not carry element data, is ignored (nothing is printed).
/// @param p    Path label printed in front of every line for this node.
void printElementStructure(GumboNode* node, string p) {
  // `v` is a union, so `v.element` may only be read for nodes that actually
  // hold a GumboElement (element and template nodes per gumbo.h). Bail out
  // instead of misinterpreting text/document payloads or dereferencing null.
  if (node == nullptr || (node->type != GUMBO_NODE_ELEMENT &&
                          node->type != GUMBO_NODE_TEMPLATE)) {
    return;
  }

  const GumboVector& children = node->v.element.children;
  fmt::print("\n{} has {} children:", p, children.length);

  for (unsigned int i = 0; i < children.length; ++i) {
    // GumboVector stores untyped void* entries; here they are child nodes.
    auto* child = static_cast<GumboNode*>(children.data[i]);
    if (child->type == GUMBO_NODE_ELEMENT) {
      auto tag = gumbo_normalized_tagname(child->v.element.tag);
      fmt::print("\n  {} -> TAG {}", p, tag);
      // Extend the path with this tag before descending.
      printElementStructure(child, fmt::format("{}/{}", p, tag));
    } else if (child->type == GUMBO_NODE_TEXT) {
      fmt::print("\n  {} -> TEXT {}", p, child->v.text.text);
    }
  }
}
  // GumboOutput* output = gumbo_parse("<h1>Hello, <span>World!</span></h1>");
  string markup{"<h1>Hello, <span>World!</span></h1>"};
  GumboOutput* output = gumbo_parse_with_options(
      &kGumboDefaultOptions, markup.data(), markup.length());

  assert(output->root->type == GUMBO_NODE_ELEMENT);

  // Node types:
  // GUMBO_NODE_DOCUMENT,
  // GUMBO_NODE_ELEMENT,
  // GUMBO_NODE_TEXT,
  // GUMBO_NODE_CDATA,
  // GUMBO_NODE_COMMENT,
  // GUMBO_NODE_WHITESPACE,
  // GUMBO_NODE_TEMPLATE

  fmt::print("\nRoot element tag: {}",
             gumbo_normalized_tagname(output->root->v.element.tag));  // html

  auto children = output->root->v.element.children;
  fmt::print("\nRoot element children: {}", children.length);  // 2: head, body

  printElementStructure(output->root, "html");

  // gumbo.h: extern const GumboOptions kGumboDefaultOptions;
  gumbo_destroy_output(&kGumboDefaultOptions, output);

Possible output


Root element tag: html
Root element children: 2
html has 2 children:
  html -> TAG head
html/head has 0 children:
  html -> TAG body
html/body has 1 children:
  html/body -> TAG h1
html/body/h1 has 2 children:
  html/body/h1 -> TEXT Hello, 
  html/body/h1 -> TAG span
html/body/h1/span has 1 children:
  html/body/h1/span -> TEXT World!

References