Word Document to Markdown

Convert a Microsoft Word document to Markdown in the browser.

Face with waiting expression Nothing to see yet!

Loading TakeyMakey...
TakeyMakey code
Want this tool to do something else? Edit the code below and make it do whatever you want.
import mammoth from "https://cdn.jsdelivr.net/npm/mammoth@1.9.0/+esm"

/**
 * Input controls presented to the user.
 *
 * The entries are positional: `make` destructures the resulting values as an
 * array in this same order, so keep the two in sync when adding, removing or
 * reordering controls.
 */
export const take = [
  {
    type: "file",
    label: "Document",
    // Mammoth only reads the zip-based .docx format. Legacy binary .doc files
    // cannot be parsed, so they are not offered in the file picker.
    accept:
      ".docx,application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    read: "buffer",
  },
  { type: "toggle", label: "Strip excess backslashes", value: true },
  { type: "toggle", label: "Remove excess new lines", value: true },
  { type: "toggle", label: "Extract footnotes and endnotes", value: true },
  { type: "toggle", label: "Remove internal links", value: true },
  { type: "toggle", label: "Remove anchor tags around URLs", value: true },
  { type: "toggle", label: "Embed images as base64 strings", value: false },
  {
    type: "dropdown",
    label: "Split chapters",
    // Each value is either `false` (no splitting) or the separator pattern
    // that `make` hands to `String.prototype.split`.
    options: [
      { label: "Never", value: false },
      { label: "Level 1 headings", value: /\n# / },
      { label: "Level 1 and 2 headings", value: /\n##? / },
    ],
  },
]

/**
 * Convert the uploaded Word document to Markdown, apply the clean-up steps the
 * user enabled, and optionally split the result into one group per chapter.
 *
 * @param {Array} inputs - Control values, in the order the controls are
 *   declared: the document's ArrayBuffer, the five clean-up toggles, the
 *   "embed images" toggle, and the chapter separator (`false` or a RegExp).
 * @returns {Promise<Array<object>|undefined>} `undefined` while no document
 *   has been chosen; otherwise a single `code` output, or one `group` per
 *   chapter when a separator is given.
 */
export const make = async ([
  arrayBuffer,
  stripSlashes,
  stripLines,
  extractNotes,
  stripInternalLinks,
  flattenLinks,
  base64img,
  splitChapters,
]) => {
  // Nothing to convert until a file has been chosen
  if (!arrayBuffer) return

  // Passing null leaves Mammoth's default image handling (inline data URIs) in
  // place. Otherwise each image becomes a numbered placeholder whose src is
  // only the content type, which keeps the Markdown small.
  let imageCount = 1
  const convertImage = base64img
    ? null
    : mammoth.images.imgElement((image) => ({
        alt: `Image ${imageCount++}`,
        src: image.contentType,
      }))

  let { value } = await mammoth.convertToMarkdown(
    { arrayBuffer },
    { convertImage }
  )

  // By default, Mammoth backslashes every ambiguous Markdown character
  if (stripSlashes) value = format.stripSlashes(value)

  // Never allow more than two newlines in a row
  if (stripLines) value = format.stripLines(value)

  // Remove the default formatting around footnotes and endnotes, converting
  // them to use the semi-conventional [^N] syntax
  if (extractNotes) value = format.extractNotes(value)

  // Remove internal links (with a leading #) and their related, empty anchors
  if (stripInternalLinks) value = format.stripInternalLinks(value)

  // If the text and URL match, don't wrap them in Markdown syntax
  if (flattenLinks) value = format.flattenLinks(value)

  if (!splitChapters) return [{ type: "code", value }]

  // The separator patterns start with "\n", so a heading on the very first
  // line would never match and would keep its "# " marker in the label. A
  // sentinel newline lets the first heading split like every other one.
  const sections = `\n${value}`.split(splitChapters)
  sections[0] = sections[0].slice(1)

  // Drop the empty chunk left over when the document opens with a heading
  if (sections.length > 1 && sections[0] === "") sections.shift()

  return sections.map((section) => {
    // First line is the heading text; the remainder is the chapter body
    const [label, ...body] = section.split("\n")

    return {
      type: "group",
      label,
      value: [
        {
          type: "code",
          label: false,
          value: body.join("\n").trim(),
        },
      ],
    }
  })
}

/**
 * Post-processing steps applied to the Markdown produced by Mammoth. Every
 * function takes the Markdown string and returns the transformed string.
 */
const format = {
  /**
   * Undo one level of backslash escaping: `\*` becomes `*` and `\\` becomes a
   * single literal backslash, so backslashes that were really in the document
   * survive. A dangling backslash at the very end is dropped.
   */
  stripSlashes: (value) => {
    return value.replace(/\\([\s\S]?)/g, "$1")
  },

  /**
   * Remove spaces that directly follow a newline, collapse any run of blank
   * lines to exactly one blank line, and trim both ends.
   */
  stripLines: (value) => {
    return value
      .replace(/\n +/g, "\n")
      .replace(/\n\n\s*/g, "\n\n")
      .trim()
  },

  /**
   * Rewrite note references such as `[[1]](#footnote-2)` to `[^1]`, turn the
   * matching list entry `1. <a id="footnote-2"></a> ` into `[^1]: `, and drop
   * the back-reference anchor and the `[↑]` back-link.
   *
   * Expects the unescaped form of the reference (`[[1]]`), i.e. the output of
   * `stripSlashes`.
   */
  extractNotes: (value) => {
    // Capture the displayed number, the note type and the id separately: the
    // id in the anchor is not guaranteed to equal the displayed number.
    const references = [...value.matchAll(/\[\[(\d+)\]\]\(#(\w+)-(\d+)\)/g)]

    for (const [reference, number, type, id] of references) {
      const definition = new RegExp(
        `\\b${number}\\. <a id="${type}-${id}"></a> `
      )
      const backLinks = new RegExp(
        `\\[↑\\]\\(#${type}-ref-${id}\\)|<a id="${type}-ref-${id}"></a>`,
        "g"
      )

      value = value
        .replace(reference, `[^${number}]`)
        .replace(definition, `[^${number}]: `)
        .replace(backLinks, "")
    }

    return value
  },

  /**
   * Replace links whose target starts with `#` by their link text and remove
   * empty `<a id="…"></a>` anchors.
   */
  stripInternalLinks: (value) => {
    return value
      .replace(/\[([^\]]+)\]\(#[^\)]+\)/g, "$1")
      .replace(/<a id="[^"]+"><\/a>/g, "")
  },

  /**
   * Replace `[text](url)` by the bare text when text and url are identical;
   * all other links are left untouched.
   */
  flattenLinks: (value) => {
    return value.replace(/\[([^\]]+)\]\(([^\)]+)\)/g, (link, text, url) => {
      return text === url ? text : link
    })
  },
}