Word Document to Markdown

Convert a Microsoft Word document to Markdown in the browser.

Face with waiting expression Nothing to see yet!

Loading takeymakey...
TakeyMakey code
Want this tool to do something else? Edit the code below and make it do whatever you want.
import mammoth from "https://cdn.jsdelivr.net/npm/mammoth@1.9.0/+esm"

// Inputs shown to the user: the document itself, followed by one on/off
// switch per clean-up step. The order here is the order in which `make`
// receives the values.
export const take = [
  {
    type: "file",
    label: "Document",
    accept: ".doc,.docx,application/msword",
    read: "buffer",
  },
  // Each pair is [label, initial state] for a toggle input
  ...[
    ["Strip excess backslashes", true],
    ["Remove excess new lines", true],
    ["Extract footnotes", true],
    ["Remove internal links", true],
    ["Remove anchor tags around URLs", true],
    ["Embed images as base64 strings", false],
  ].map(([label, value]) => ({ type: "toggle", label, value })),
]

// Convert the uploaded document to Markdown, then run whichever clean-up
// steps are switched on. Resolves to a single "code" output, or to
// undefined while no document has been provided.
export const make = async ([
  arrayBuffer,
  stripSlashes,
  stripLines,
  extractNotes,
  stripInternalLinks,
  flattenLinks,
  base64img,
]) => {
  // Nothing to convert yet
  if (!arrayBuffer) return

  // Leaving the converter as null keeps Mammoth's own image handling;
  // otherwise each image's src becomes a comment naming its content type
  let convertImage = null
  if (!base64img) {
    convertImage = mammoth.images.imgElement((image) => ({
      src: `<!-- ${image.contentType} -->`,
    }))
  }

  const converted = await mammoth.convertToMarkdown(
    { arrayBuffer },
    { convertImage }
  )

  // Applied top to bottom; the order is significant because later steps
  // work on the output of earlier ones
  const cleanups = [
    [stripSlashes, "stripSlashes"],
    [stripLines, "stripLines"],
    [extractNotes, "extractNotes"],
    [stripInternalLinks, "stripInternalLinks"],
    [flattenLinks, "flattenLinks"],
  ]

  const value = cleanups.reduce(
    (text, [enabled, step]) => (enabled ? format[step](text) : text),
    converted.value
  )

  return [{ type: "code", value }]
}

// Post-processing steps for the Markdown produced by Mammoth. Every function
// takes the Markdown string and returns a new string.
const format = {
  stripSlashes: (value) => {
    // By default, Mammoth backslashes every ambiguous Markdown character.
    // Drop each escaping backslash but keep the character it escapes, so an
    // escaped backslash (two in a row, i.e. a literal backslash in the
    // document) survives as a single one instead of vanishing. A lone
    // trailing backslash has nothing to escape and is simply removed.
    return value.replace(/\\([\s\S]?)/g, "$1")
  },
  stripLines: (value) => {
    // Drop spaces at the start of lines, collapse any whitespace that follows
    // a blank line so there are never more than two newlines in a row, and
    // trim both ends of the result
    return value
      .replace(/\n +/g, "\n")
      .replace(/\n\n\s*/g, "\n\n")
      .trim()
  },
  extractNotes: (value) => {
    // Remove the default formatting around footnotes and endnotes, converting
    // them to use the semi-conventional [^N] syntax.
    //
    // References look like `[[N]](#prefix-N)`. `match` returns null rather
    // than an empty array when there are none, so fall back to [] to leave
    // note-free documents untouched instead of throwing.
    const notes = value.match(/\[\[\d+\]\]\(#\w+-\d+\)/g) || []

    let result = value

    for (const note of notes) {
      // The digits between "[[" and the first "]"
      const index = note.slice(2, note.indexOf("]"))

      result = result
        // The in-text reference
        .replace(note, `[^${index}]`)
        // The numbered list item that holds the note text.
        // NOTE(review): the "." after the number is unescaped, so it matches
        // any single character (newlines included, because of the "s" flag)
        // — confirm whether a literal full stop was intended
        .replace(
          new RegExp(`${index}. <a id="\\w+-${index}"></a> `, "s"),
          `[^${index}]: `
        )
        // The "↑" link pointing back at the reference
        .replace(new RegExp(`\\[↑\\]\\(#[\\w-]+-${index}\\)`), "")
    }

    return result
  },
  stripInternalLinks: (value) => {
    // Remove internal links (with a leading #), keeping their text, and
    // remove the empty anchors that act as their targets
    return value
      .replace(/\[([^\]]+)\]\(#[^\)]+\)/g, "$1")
      .replace(/<a id="[^"]+"><\/a>/g, "")
  },
  flattenLinks: (value) => {
    // If the text and URL match, don't wrap them in Markdown syntax
    return value.replace(/\[([^\]]+)\]\(([^\)]+)\)/g, (link, text, url) => {
      return text === url ? text : link
    })
  },
}