Convert a Microsoft Word document to Markdown in the browser.
import mammoth from "https://cdn.jsdelivr.net/npm/mammoth@1.9.0/+esm"
// Input declarations. The entries line up positionally with the array that
// `make` destructures: one file, six toggles, then the chapter-split dropdown.
export const take = [
  {
    type: "file",
    label: "Document",
    accept: ".doc,.docx,application/msword",
    read: "buffer",
  },
  // The toggles only differ by label and value, so build them from a table
  ...[
    ["Strip excess backslashes", true],
    ["Remove excess new lines", true],
    ["Extract footnotes and endnotes", true],
    ["Remove internal links", true],
    ["Remove anchor tags around URLs", true],
    ["Embed images as base64 strings", false],
  ].map(([label, value]) => ({ type: "toggle", label, value })),
  {
    type: "dropdown",
    label: "Split chapters",
    // Every value other than `false` is the pattern `make` splits the Markdown on
    options: [
      { label: "Never", value: false },
      { label: "Level 1 headings", value: /\n# / },
      { label: "Level 1 and 2 headings", value: /\n##? / },
    ],
  },
]
/**
 * Convert a Word document to Markdown with Mammoth and apply the selected
 * clean-ups.
 *
 * Takes a single array argument whose entries are, in order:
 *   arrayBuffer        - the document contents; nothing is produced without it
 *   stripSlashes       - run `format.stripSlashes` on the Markdown
 *   stripLines         - run `format.stripLines`
 *   extractNotes       - run `format.extractNotes`
 *   stripInternalLinks - run `format.stripInternalLinks`
 *   flattenLinks       - run `format.flattenLinks`
 *   base64img          - when truthy, no custom image converter is passed to
 *                        Mammoth; otherwise images become numbered placeholders
 *   splitChapters      - falsy, or the pattern used to split the Markdown
 *
 * @returns {Promise<Array<object>|undefined>} `undefined` when no document was
 *   supplied. Otherwise either a single `code` item holding all the Markdown,
 *   or one `group` per chapter, each labelled with the chapter's first line and
 *   wrapping a `code` item with the remaining text.
 */
export const make = async ([
  arrayBuffer,
  stripSlashes,
  stripLines,
  extractNotes,
  stripInternalLinks,
  flattenLinks,
  base64img,
  splitChapters,
]) => {
  if (!arrayBuffer) return

  let imageCount = 1
  // Placeholder images get a running "Image N" alt text and carry their
  // content type in `src` rather than the image data.
  const convertImage = base64img
    ? null
    : mammoth.images.imgElement((image) => ({
        alt: `Image ${imageCount++}`,
        src: image.contentType,
      }))
  const converted = await mammoth.convertToMarkdown(
    { arrayBuffer },
    { convertImage }
  )
  let value = converted.value

  // By default, Mammoth backslashes every ambiguous Markdown character
  if (stripSlashes) value = format.stripSlashes(value)
  // Never allow more than two newlines in a row
  if (stripLines) value = format.stripLines(value)
  // Remove the default formatting around footnotes and endnotes, converting
  // them to use the semi-conventional [^N] syntax
  if (extractNotes) value = format.extractNotes(value)
  // Remove internal links (with a leading #) and their related, empty anchors
  if (stripInternalLinks) value = format.stripInternalLinks(value)
  // If the text and URL match, don't wrap them in Markdown syntax
  if (flattenLinks) value = format.flattenLinks(value)

  if (!splitChapters) return [{ type: "code", value }]

  // The split patterns begin with "\n", so a heading on the very first line
  // would not be matched and its label would keep the "# " marker. Prefixing a
  // newline makes that heading split exactly like every later one.
  const [lead, ...chapters] = `\n${value}`.split(splitChapters)
  // `lead` is whatever precedes the first heading, minus the newline added above
  const preface = lead.slice(1)
  // Skip a blank lead chunk, unless it is all there is (nothing was split)
  const sections =
    preface.trim() || chapters.length === 0 ? [preface, ...chapters] : chapters

  return sections.map((section) => {
    // The first line names the chapter; everything after it is the content
    const [label, ...rest] = section.split("\n")
    return {
      type: "group",
      label,
      value: [
        {
          type: "code",
          label: false,
          value: rest.join("\n").trim(),
        },
      ],
    }
  })
}
/**
 * Clean-up passes applied to the converted Markdown. Each function takes the
 * Markdown string and returns the transformed string.
 */
const format = {
  /**
   * Undo backslash escaping: the backslash of every escape pair is dropped and
   * the escaped character is kept, so `\[` becomes `[` and `\\` becomes a
   * single literal backslash. A dangling backslash at the very end is removed.
   */
  stripSlashes: (value) => {
    return value.replace(/\\([\s\S]?)/g, "$1")
  },

  /**
   * Remove spaces that directly follow a newline, collapse any run of blank
   * lines (and whitespace after it) to exactly one blank line, and trim both
   * ends of the text.
   */
  stripLines: (value) => {
    const withoutIndent = value.replace(/\n +/g, "\n")
    return withoutIndent.replace(/\n\n\s*/g, "\n\n").trim()
  },

  /**
   * Rewrite note references of the form `[[N]](#name-N)` as `[^N]`, turn the
   * matching `N. <a id="name-N"></a> ` list entry into a `[^N]: ` definition,
   * and delete the `[↑](#...-N)` back-links and `<a id="...-ref-N"></a>`
   * anchors that belong to the note.
   */
  extractNotes: (value) => {
    const references = value.match(/\[\[\d+\]\]\(#\w+-\d+\)/g) || []
    let result = value
    for (const reference of references) {
      // The number sits between the leading "[[" and the first "]"
      const index = reference.slice(2, reference.indexOf("]"))
      // The dot is escaped so that "1." cannot match "12" or similar
      const definition = new RegExp(`${index}\\. <a id="\\w+-${index}"></a> `)
      const backLinks = new RegExp(
        `\\[↑\\]\\(#[\\w-]+-${index}\\)|<a id="\\w+-ref-${index}"></a>`,
        "g"
      )
      result = result
        .replace(reference, `[^${index}]`)
        .replace(definition, `[^${index}]: `)
        .replace(backLinks, "")
    }
    return result
  },

  /**
   * Replace links whose target starts with `#` by their link text, then delete
   * every empty `<a id="..."></a>` anchor.
   */
  stripInternalLinks: (value) => {
    const unlinked = value.replace(/\[([^\]]+)\]\(#[^\)]+\)/g, "$1")
    return unlinked.replace(/<a id="[^"]+"><\/a>/g, "")
  },

  /**
   * Replace a Markdown link by its bare text when the text and the target are
   * identical; all other links are left as they are.
   */
  flattenLinks: (value) => {
    return value.replace(/\[([^\]]+)\]\(([^\)]+)\)/g, (link, text, target) => {
      return text === target ? text : link
    })
  },
}