Word Document to Markdown

Face with waiting expression Nothing to see yet!
Loading takeymakey...
TakeyMakey code
import mammoth from "https://cdn.jsdelivr.net/npm/mammoth@1.9.0/+esm"

// Input definitions for the tool: one document upload followed by the
// conversion options. `make` destructures its argument array positionally with
// the same number of entries, so the order here matters.

// Builds a single on/off option entry.
const toggleInput = (label, value) => ({ type: "toggle", label, value })

// Builds one entry of the "Split chapters" dropdown. `value` is either `false`
// or the pattern that `make` hands to String.prototype.split.
const splitOption = (label, value) => ({ label, value })

export const take = [
  {
    type: "file",
    label: "Document",
    // NOTE(review): legacy ".doc" / "application/msword" files are accepted
    // here, but Mammoth is documented as a .docx converter; confirm whether
    // .doc uploads should be offered at all.
    accept: ".doc,.docx,application/msword",
    read: "buffer",
  },
  toggleInput("Strip excess backslashes", true),
  toggleInput("Remove excess new lines", true),
  toggleInput("Extract footnotes and endnotes", true),
  toggleInput("Remove internal links", true),
  toggleInput("Remove anchor tags around URLs", true),
  toggleInput("Embed images as base64 strings", false),
  {
    type: "dropdown",
    label: "Split chapters",
    options: [
      splitOption("Never", false),
      splitOption("Level 1 headings", /\n# /),
      splitOption("Level 1 and 2 headings", /\n##? /),
    ],
  },
]

/**
 * Convert an uploaded Word document to Markdown and apply the selected
 * clean-up passes.
 *
 * The single argument is an array destructured positionally; presumably it
 * carries the values of the `take` inputs in the same order (verify against
 * the host framework).
 *
 * @param {Array} inputs
 *   [arrayBuffer, stripSlashes, stripLines, extractNotes, stripInternalLinks,
 *   flattenLinks, base64img, splitChapters]. `splitChapters` is either falsy
 *   or a pattern usable with String.prototype.split.
 * @returns {Promise<Array|undefined>} `undefined` when no document was
 *   supplied; otherwise a single `code` item, or one `group` item per chapter
 *   when `splitChapters` is set.
 */
export const make = async ([
  arrayBuffer,
  stripSlashes,
  stripLines,
  extractNotes,
  stripInternalLinks,
  flattenLinks,
  base64img,
  splitChapters,
]) => {
  // Nothing to convert until a document has been supplied
  if (!arrayBuffer) return

  let imageCount = 1

  // Unless base64 embedding was requested, replace each image's data with a
  // short placeholder: a numbered alt text and the content type as the src.
  // Passing null presumably leaves Mammoth's own default image handling in
  // place (confirm against the Mammoth documentation).
  const convertImage = base64img
    ? null
    : mammoth.images.imgElement((image) => ({
        alt: `Image ${imageCount++}`,
        src: image.contentType,
      }))

  const result = await mammoth.convertToMarkdown(
    { arrayBuffer },
    { convertImage }
  )
  let value = result.value

  // By default, Mammoth backslashes every ambiguous Markdown character
  if (stripSlashes) value = format.stripSlashes(value)

  // Never allow more than two newlines in a row
  if (stripLines) value = format.stripLines(value)

  // Remove the default formatting around footnotes and endnotes, converting
  // them to use the semi-conventional [^N] syntax
  if (extractNotes) value = format.extractNotes(value)

  // Remove internal links (with a leading #) and their related, empty anchors
  if (stripInternalLinks) value = format.stripInternalLinks(value)

  // If the text and URL match, don't wrap them in Markdown syntax
  if (flattenLinks) value = format.flattenLinks(value)

  if (!splitChapters) return [{ type: "code", value }]

  // The split patterns require a newline before the heading marker, so a
  // heading on the very first line would otherwise keep its "# " prefix in the
  // label. Splitting with a sentinel newline in front lets that first heading
  // be consumed like every other one.
  const [head, ...tail] = `\n${value}`.split(splitChapters)

  // An empty head means the document opened with a heading; otherwise drop
  // the sentinel again so the leading chunk is exactly what it was before.
  const chapters = head === "" ? tail : [head.slice(1), ...tail]

  return chapters.map((chapter) => {
    // The first line of each chunk is the heading text; the rest is the body
    const [label, ...rest] = chapter.split("\n")

    return {
      type: "group",
      label,
      value: [
        {
          type: "code",
          label: false,
          value: rest.join("\n").trim(),
        },
      ],
    }
  })
}

/**
 * Clean-up passes for the Markdown produced by Mammoth. Every function takes
 * the Markdown string and returns a new string; none of them depend on each
 * other.
 */
const format = {
  /**
   * Remove escaping backslashes.
   *
   * A single backslash is dropped, but a doubled backslash (the Markdown
   * escape for a literal backslash) collapses to one so that literal
   * backslashes in the document text, e.g. Windows paths, are not lost.
   *
   * @param {string} value Markdown text
   * @returns {string} text with escaping backslashes removed
   */
  stripSlashes: (value) => {
    // The optional capture keeps the second backslash of an escaped pair
    return value.replace(/\\(\\?)/g, "$1")
  },

  /**
   * Collapse excess whitespace between lines.
   *
   * Spaces at the start of a line are removed, any run of whitespace that
   * follows a blank line is reduced to that single blank line, and the result
   * is trimmed at both ends.
   *
   * @param {string} value Markdown text
   * @returns {string} text with at most two consecutive newlines
   */
  stripLines: (value) => {
    return value
      .replace(/\n +/g, "\n")
      .replace(/\n\n\s*/g, "\n\n")
      .trim()
  },

  /**
   * Rewrite note references of the form `[[N]](#name-N)` to `[^N]`, turn the
   * matching numbered list entry (`N. <a id="name-N"></a> `) into a
   * `[^N]: ` definition, and delete the `[↑](#...-N)` back-links and
   * `<a id="...-ref-N"></a>` anchors belonging to that note.
   *
   * Only unescaped `[[N]]` references are recognised.
   *
   * @param {string} value Markdown text
   * @returns {string} text using `[^N]` note syntax
   */
  extractNotes: (value) => {
    const notes = value.match(/\[\[\d+\]\]\(#\w+-\d+\)/g)

    // String.prototype.match returns null when there are no references
    if (!notes) return value

    for (const note of notes) {
      // The note number sits between the "[[" prefix and the first "]"
      const index = note.slice(2, note.indexOf("]"))

      value = value
        // Inline reference (first remaining occurrence)
        .replace(note, `[^${index}]`)
        // Start of the note's entry in the generated list
        .replace(
          new RegExp(`${index}. <a id="\\w+-${index}"></a> `),
          `[^${index}]: `
        )
        // Back-links to the reference and the reference's own anchor
        .replace(
          new RegExp(
            `\\[↑\\]\\(#[\\w-]+-${index}\\)|<a id="\\w+-ref-${index}"></a>`,
            "g"
          ),
          ""
        )
    }

    return value
  },

  /**
   * Replace links whose target starts with `#` by their link text and remove
   * empty `<a id="..."></a>` anchors.
   *
   * @param {string} value Markdown text
   * @returns {string} text without internal links or empty anchors
   */
  stripInternalLinks: (value) => {
    return value
      .replace(/\[([^\]]+)\]\(#[^\)]+\)/g, "$1")
      .replace(/<a id="[^"]+"><\/a>/g, "")
  },

  /**
   * Replace `[text](url)` by the bare text when text and URL are identical;
   * all other links are left as they are.
   *
   * @param {string} value Markdown text
   * @returns {string} text with self-describing links flattened
   */
  flattenLinks: (value) => {
    return value.replace(/\[([^\]]+)\]\(([^\)]+)\)/g, (link, text, url) => {
      return text === url ? text : link
    })
  },
}