Basic workflow for converting docx to unified format (#7779)

* Basic workflow for converting docx to unified format

Signed-off-by: Victor Ilyushchenko <alt13ri@gmail.com>

* dep fix

Signed-off-by: Victor Ilyushchenko <alt13ri@gmail.com>

* @vercel/webpack-asset-relocator-loader can't handle mammoth for some reason, moved dep to import-tool

Signed-off-by: Victor Ilyushchenko <alt13ri@gmail.com>

---------

Signed-off-by: Victor Ilyushchenko <alt13ri@gmail.com>
This commit is contained in:
Victor Ilyushchenko 2025-01-24 12:13:40 +03:00 committed by GitHub
parent 9a29cde7a6
commit d6b7a38af9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 342 additions and 41 deletions

View File

@ -1727,8 +1727,8 @@ importers:
specifier: ^3.1.0
version: 3.1.0
mammoth:
specifier: ^1.6.0
version: 1.8.0
specifier: ^1.9.0
version: 1.9.0
markdown-it:
specifier: ^14.0.0
version: 14.0.0
@ -4119,7 +4119,7 @@ packages:
version: 0.0.0
'@rush-temp/import-tool@file:projects/import-tool.tgz':
resolution: {integrity: sha512-imMlneB1gppaXcJi4brs9jxXdGgK5zUGanT6vjF2+lNnrRCnp1X3Ys6/iRVLlexVOnSsA6BTpaZ1Xo5SSTO/gg==, tarball: file:projects/import-tool.tgz}
resolution: {integrity: sha512-yTXXuY90bmLrEEiipvDCuv+mLPokLCSLajsI3+hEES0iCdCbiJlua2NYv73WVshCcu0oq1acpRPElKxELjNgDw==, tarball: file:projects/import-tool.tgz}
version: 0.0.0
'@rush-temp/importer@file:projects/importer.tgz':
@ -4587,7 +4587,7 @@ packages:
version: 0.0.0
'@rush-temp/pod-print@file:projects/pod-print.tgz':
resolution: {integrity: sha512-19hcUtJBpca/kmHtIv4Z30rvHIaMLeGm3PdirKFuVS7VAdmYV6Um+m4o+5fq2LUUcda4T10vHscPeAvZR8EPEA==, tarball: file:projects/pod-print.tgz}
resolution: {integrity: sha512-uH6mY0Z3/3bbAe+rGjr2WPJmavKnoNgKZ5KoLqToGGF+iDzZOZGLjd8OrwUI90Z3behW7PPIRuPDGiRmFGOtcw==, tarball: file:projects/pod-print.tgz}
version: 0.0.0
'@rush-temp/pod-server@file:projects/pod-server.tgz':
@ -4671,7 +4671,7 @@ packages:
version: 0.0.0
'@rush-temp/qms-doc-import-tool@file:projects/qms-doc-import-tool.tgz':
resolution: {integrity: sha512-QqdrovP6ZWs8dmU+6Ly95n8BFMjlvtMcme4uj9XgPqCKMOmuMhtQ0Wn5ae3h8hzgB1K2HK25F0BPrzhRBGlxTA==, tarball: file:projects/qms-doc-import-tool.tgz}
resolution: {integrity: sha512-m7UFAU/1lPMVaVWNf5rvDKrRWTxOzjuWinK48EQ8OSJD3JSB5SP/IHrW9zSckgWBnLYL1vnljAgjM0QZ2qjIlQ==, tarball: file:projects/qms-doc-import-tool.tgz}
version: 0.0.0
'@rush-temp/qms-tests-sanity@file:projects/qms-tests-sanity.tgz':
@ -4711,7 +4711,7 @@ packages:
version: 0.0.0
'@rush-temp/rekoni-service@file:projects/rekoni-service.tgz':
resolution: {integrity: sha512-c3Vh1CX471Q8N6l5hoftqqktDZ7PuqzOXwcnhy5bNc14ynVi8Q9mTV7hHeP5xCFzQzepXCQ+tF5Etparbn2pdQ==, tarball: file:projects/rekoni-service.tgz}
resolution: {integrity: sha512-KwM2th57U3OVRVgPsNgcakA3gCpIYsASv2TeeqJAbv0cFim1ha1xDaeb8A96O9vTHjHRknDfgqQVYYIUdosQwQ==, tarball: file:projects/rekoni-service.tgz}
version: 0.0.0
'@rush-temp/rekoni@file:projects/rekoni.tgz':
@ -10276,8 +10276,8 @@ packages:
resolution: {integrity: sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==}
hasBin: true
lop@0.4.1:
resolution: {integrity: sha512-9xyho9why2A2tzm5aIcMWKvzqKsnxrf9B5I+8O30olh6lQU8PH978LqZoI4++37RBgS1Em5i54v1TFs/3wnmXQ==}
lop@0.4.2:
resolution: {integrity: sha512-RefILVDQ4DKoRZsJ4Pj22TxE3omDO47yFpkIBoDKzkqPRISs5U1cnAdg/5583YPkWPaLIYHOKRMQSvjFsO26cw==}
lower-case@2.0.2:
resolution: {integrity: sha512-7fm3l3NAF9WfN6W3JOmf5drwpVqX78JtoGJ3A6W0a6ZnldM41w2fV5D490psKFTpMds8TJse/eHLFFsNHHjHgg==}
@ -10328,8 +10328,8 @@ packages:
makeerror@1.0.12:
resolution: {integrity: sha512-JmqCvUhmt43madlpFzG4BQzG2Z3m6tvQDNKdClZnO3VbIudJYmxsT0FNJMeiB2+JTSlTQTSbU8QdesVmwJcmLg==}
mammoth@1.8.0:
resolution: {integrity: sha512-pJNfxSk9IEGVpau+tsZFz22ofjUsl2mnA5eT8PjPs2n0BP+rhVte4Nez6FdgEuxv3IGI3afiV46ImKqTGDVlbA==}
mammoth@1.9.0:
resolution: {integrity: sha512-F+0NxzankQV9XSUAuVKvkdQK0GbtGGuqVnND9aVf9VSeUA82LQa29GjLqYU6Eez8LHqSJG3eGiDW3224OKdpZg==}
engines: {node: '>=12.0.0'}
hasBin: true
@ -18227,6 +18227,7 @@ snapshots:
'@rush-temp/import-tool@file:projects/import-tool.tgz(@babel/core@7.23.9)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.23.9))':
dependencies:
'@types/jest': 29.5.12
'@types/js-yaml': 4.0.9
'@types/node': 20.11.19
'@typescript-eslint/eslint-plugin': 6.21.0(@typescript-eslint/parser@6.21.0(eslint@8.56.0)(typescript@5.3.3))(eslint@8.56.0)(typescript@5.6.2)
'@typescript-eslint/parser': 6.21.0(eslint@8.56.0)(typescript@5.6.2)
@ -18239,6 +18240,8 @@ snapshots:
eslint-plugin-n: 15.7.0(eslint@8.56.0)
eslint-plugin-promise: 6.1.1(eslint@8.56.0)
jest: 29.7.0(@types/node@20.11.19)(ts-node@10.9.2(@types/node@20.11.19)(typescript@5.3.3))
js-yaml: 4.1.0
mammoth: 1.9.0
prettier: 3.2.5
ts-jest: 29.1.2(@babel/core@7.23.9)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.23.9))(esbuild@0.24.2)(jest@29.7.0(@types/node@20.11.19)(ts-node@10.9.2(@types/node@20.11.19)(typescript@5.3.3)))(typescript@5.6.2)
ts-node: 10.9.2(@types/node@20.11.19)(typescript@5.6.2)
@ -20760,7 +20763,7 @@ snapshots:
eslint-plugin-promise: 6.1.1(eslint@8.56.0)
express: 4.21.2
jest: 29.7.0(@types/node@20.11.19)(ts-node@10.9.2(@types/node@20.11.19)(typescript@5.3.3))
mammoth: 1.8.0
mammoth: 1.9.0
prettier: 3.2.5
puppeteer: 22.14.0(bufferutil@4.0.8)(typescript@5.3.3)(utf-8-validate@6.0.4)
ts-jest: 29.1.2(@babel/core@7.23.9)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.23.9))(esbuild@0.24.2)(jest@29.7.0(@types/node@20.11.19)(ts-node@10.9.2(@types/node@20.11.19)(typescript@5.3.3)))(typescript@5.3.3)
@ -21533,7 +21536,7 @@ snapshots:
eslint-plugin-promise: 6.1.1(eslint@8.56.0)
htmlparser2: 9.1.0
jest: 29.7.0(@types/node@20.11.19)(ts-node@10.9.2(@types/node@20.11.19)(typescript@5.3.3))
mammoth: 1.8.0
mammoth: 1.9.0
prettier: 3.2.5
ts-jest: 29.1.2(@babel/core@7.23.9)(@jest/types@29.6.3)(babel-jest@29.7.0(@babel/core@7.23.9))(esbuild@0.24.2)(jest@29.7.0(@types/node@20.11.19)(ts-node@10.9.2(@types/node@20.11.19)(typescript@5.3.3)))(typescript@5.3.3)
ts-node: 10.9.2(@types/node@20.11.19)(typescript@5.3.3)
@ -21842,7 +21845,7 @@ snapshots:
jimp: 0.16.13
jwt-simple: 0.5.6
libphonenumber-js: 1.10.56
mammoth: 1.8.0
mammoth: 1.9.0
mime-types: 2.1.35
morgan: 1.10.0
node-loader: 2.0.0(webpack@5.97.1)
@ -32113,7 +32116,7 @@ snapshots:
dependencies:
js-tokens: 4.0.0
lop@0.4.1:
lop@0.4.2:
dependencies:
duck: 0.1.12
option: 0.2.4
@ -32185,7 +32188,7 @@ snapshots:
dependencies:
tmpl: 1.0.5
mammoth@1.8.0:
mammoth@1.9.0:
dependencies:
'@xmldom/xmldom': 0.8.10
argparse: 1.0.10
@ -32193,7 +32196,7 @@ snapshots:
bluebird: 3.4.7
dingbat-to-unicode: 1.0.1
jszip: 3.10.1
lop: 0.4.1
lop: 0.4.2
path-is-absolute: 1.0.1
underscore: 1.13.7
xmlbuilder: 10.1.1

View File

@ -63,7 +63,7 @@
"domhandler": "^5.0.3",
"domutils": "^3.1.0",
"htmlparser2": "^9.0.0",
"mammoth": "^1.6.0",
"mammoth": "^1.9.0",
"docx4js": "^3.2.20",
"zod": "^3.22.4"
}

View File

@ -47,13 +47,16 @@
"eslint-plugin-import": "^2.26.0",
"eslint-plugin-n": "^15.4.0",
"eslint-plugin-promise": "^6.1.1",
"prettier": "^3.1.0"
"prettier": "^3.1.0",
"@types/js-yaml": "^4.0.9"
},
"dependencies": {
"@hcengineering/core": "^0.6.32",
"@hcengineering/platform": "^0.6.11",
"@hcengineering/server-client": "^0.6.0",
"@hcengineering/importer": "^0.6.1",
"commander": "^8.1.0"
"commander": "^8.1.0",
"js-yaml": "^4.1.0",
"mammoth": "^1.9.0"
}
}

View File

@ -13,6 +13,18 @@
// limitations under the License.
//
import { concatLink, TxOperations } from '@hcengineering/core'
import {
ClickupImporter,
defaultDocumentPreprocessors,
DocumentConverter,
FrontFileUploader,
importNotion,
UnifiedFormatImporter,
type DocumentConverterOptions,
type FileUploader,
type Logger
} from '@hcengineering/importer'
import { setMetadata } from '@hcengineering/platform'
import serverClientPlugin, {
createClient,
getUserWorkspaces,
@ -20,15 +32,10 @@ import serverClientPlugin, {
selectWorkspace
} from '@hcengineering/server-client'
import { program } from 'commander'
import { setMetadata } from '@hcengineering/platform'
import {
UnifiedFormatImporter,
ClickupImporter,
importNotion,
FrontFileUploader,
type FileUploader,
type Logger
} from '@hcengineering/importer'
import { readFileSync } from 'fs'
import * as yaml from 'js-yaml'
import mammoth from 'mammoth'
import { join } from 'path'
class ConsoleLogger implements Logger {
log (msg: string, data?: any): void {
@ -165,5 +172,38 @@ export function importTool (): void {
})
})
// Registers the `convert-qms-docx` command: loads converter settings from a YAML file,
// wraps the user-defined steps with the built-in ones and converts every document in <dir>.
program
  .command('convert-qms-docx <dir>')
  .requiredOption('-o, --out <dir>', 'out')
  .option('-c, --config <file>', 'configPath')
  .description('convert QMS document into Unified Huly Format')
  .action(async (dir: string, cmd) => {
    // commander names the option after its long flag, so `--config` is exposed as `cmd.config`
    const { out, config: configPath } = cmd
    // Without an explicit config the converter looks for `import.yaml` next to the documents
    const configSearchPath = configPath ?? join(dir, 'import.yaml')

    let config: DocumentConverterOptions
    try {
      const configYaml = readFileSync(configSearchPath, 'utf-8')
      // NOTE(review): the parsed YAML is only cast, not validated against DocumentConverterOptions
      const configFromFile = yaml.load(configYaml) as DocumentConverterOptions
      config = { ...configFromFile, outputPath: out }
    } catch (e: any) {
      console.error(`Unable to load config file from ${configSearchPath}: ${e}`)
      return
    }

    // The config file may omit `steps` entirely; spreading `undefined` into an array would throw
    const userSteps = Array.isArray(config.steps) ? config.steps : []
    config.steps = [
      { name: '_extractImages' },
      { name: '_cleanupMarkup' },
      ...userSteps,
      { name: '_addStubHeader' }
    ]

    config.htmlConverter = async (path) => (await mammoth.convertToHtml({ path })).value

    const converter = new DocumentConverter(config, defaultDocumentPreprocessors)
    await converter.processFolder(dir)
    await converter.flush()
  })
program.parse(process.argv)
}

View File

@ -0,0 +1,127 @@
//
// Copyright © 2025 Hardcore Engineering Inc.
//
// Licensed under the Eclipse Public License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. You may
// obtain a copy of the License at https://www.eclipse.org/legal/epl-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//
// See the License for the specific language governing permissions and
// limitations under the License.
//
import { defaultExtensions, htmlToJSON, MarkupNode, serializeMessage } from '@hcengineering/text'
import { mkdir, readdir, readFile, writeFile } from 'fs/promises'
import * as yaml from 'js-yaml'
import { basename, dirname, extname, join, relative } from 'path'
import { UnifiedControlledDocumentHeader, UnifiedDocumentTemplateHeader } from '../huly/unified'
/**
 * Settings that drive a DocumentConverter run.
 */
export interface DocumentConverterOptions {
// Directory that every generated output path is joined onto
outputPath: string
// Read by the `_addStubHeader` preprocessor as both author and owner of the generated header
owner: string
// Preprocessing steps; the DocumentConverter constructor resolves each one by name, in the listed order
steps: DocumentPreprocessorOptions<any>[]
// Turns the file at the given path into an HTML string, which is then parsed into markup
htmlConverter: (path: string) => Promise<string>
}
/**
 * A document as it travels through the preprocessor chain.
 */
export interface DocumentState {
// Source file name without its extension
name: string
// Path of the source file
path: string
// Root folder that was passed to the converter when the document was processed
root: string
// Markup tree parsed from the converted HTML
markup: MarkupNode
// Optional header; it is dumped as YAML in front of the markdown when the document is compiled
header?: UnifiedControlledDocumentHeader | UnifiedDocumentTemplateHeader
}
/**
 * One entry of the `steps` list in DocumentConverterOptions.
 */
export interface DocumentPreprocessorOptions<T> {
// Key used to look the step up in the spec map given to the DocumentConverter constructor
name: string
// Step-specific settings handed to the spec when the preprocessor is created
options?: T
}
/**
 * A single transformation of a document; returning `undefined` leaves the incoming state in effect.
 */
export type DocumentPreprocessor = (document: DocumentState) => DocumentState | undefined
/**
 * Factory that creates a preprocessor for the given converter from the step's optional settings.
 */
export type DocumentPreprocessorSpec<T> = (converter: DocumentConverter, options?: T) => DocumentPreprocessor
/**
 * Converts a folder of `.docx` documents into markdown files with a YAML header.
 *
 * Results are collected in memory and only written to disk by `flush`.
 */
export class DocumentConverter {
  /** Processed documents keyed by their source path. */
  documents = new Map<string, DocumentState>()
  /** Pending output files keyed by their destination path (already joined onto `options.outputPath`). */
  output = new Map<string, Buffer | string>()
  /** Preprocessors in the order they are applied to each document. */
  preprocessors: DocumentPreprocessor[]
  options: DocumentConverterOptions

  /**
   * @param options - converter settings; `options.steps` selects the preprocessors to run
   * @param specs - available preprocessor factories keyed by step name
   * @throws Error when a step refers to a name that is missing from `specs`
   */
  constructor (options: DocumentConverterOptions, specs: Record<string, DocumentPreprocessorSpec<any>>) {
    this.options = options
    this.preprocessors = []

    for (const step of options.steps) {
      const spec = specs[step.name]
      if (spec === undefined) {
        throw new Error(`Unknown step: ${step.name}`)
      }
      this.preprocessors.push(spec(this, step.options))
    }
  }

  /**
   * Walks `root`, converting every `.docx` file and passing `.md` files through unchanged.
   * Files with any other extension are ignored.
   */
  async processFolder (root: string): Promise<void> {
    const files = await scanFiles(root)
    for (const path of files) {
      // Compare case-insensitively so that e.g. `REPORT.DOCX` is not silently skipped
      const ext = extname(path).toLowerCase()
      if (ext === '.docx') await this.processDocument(path, root)
      else if (ext === '.md') this.addOutputFile(relative(root, path), await readFile(path, 'utf-8'))
    }
  }

  /**
   * Converts one document to markup, runs it through all preprocessors and queues the
   * resulting markdown under the same relative location with an `.md` extension.
   */
  async processDocument (path: string, root: string): Promise<void> {
    const htmlString = await this.options.htmlConverter(path)
    const markup = htmlToJSON(htmlString, defaultExtensions)

    let document: DocumentState = {
      name: fileNameNoExt(path),
      path,
      root,
      markup
    }

    for (const processor of this.preprocessors) {
      // A preprocessor may return undefined to keep the document as it is
      document = processor(document) ?? document
    }

    this.documents.set(path, document)

    const content = compileMarkdown(document)
    this.addOutputFile(join(relative(root, dirname(path)), fileNameNoExt(path)) + '.md', content)
  }

  /**
   * Queues a file for writing; `rel` is resolved against `options.outputPath`.
   * A later call with the same path replaces the earlier content.
   */
  addOutputFile (rel: string, content: string | Buffer): void {
    this.output.set(join(this.options.outputPath, rel), content)
  }

  /** Writes every queued file to disk, creating parent directories as needed. */
  async flush (): Promise<void> {
    for (const [path, content] of this.output) {
      await mkdir(dirname(path), { recursive: true })
      await writeFile(path, content as any)
    }
  }
}
/**
 * Renders a document as markdown text preceded by its header, dumped as YAML between `---` lines.
 */
function compileMarkdown (file: DocumentState): string {
  const body = serializeMessage(file.markup, 'ref://', '')
  const frontMatter = ['---\n', yaml.dump(file.header), '---\n'].join('')
  return frontMatter + body
}
/**
 * Returns the last segment of `path` with its extension (as reported by `extname`) removed.
 */
function fileNameNoExt (path: string): string {
  return basename(path, extname(path))
}
/**
 * Recursively lists every non-directory entry below `dir`.
 *
 * The tree is walked explicitly instead of using `readdir`'s `recursive` option together with
 * `Dirent.path`: that property is deprecated in Node.js, and building each path from the
 * directory being read yields the same result on every Node version.
 *
 * @param dir - folder to scan
 * @returns paths of the found entries, each prefixed with `dir`
 */
async function scanFiles (dir: string): Promise<string[]> {
  const entries = await readdir(dir, { withFileTypes: true })
  const files: string[] = []
  for (const entry of entries) {
    const entryPath = join(dir, entry.name)
    if (entry.isDirectory()) {
      files.push(...(await scanFiles(entryPath)))
    } else {
      files.push(entryPath)
    }
  }
  return files
}

View File

@ -0,0 +1,126 @@
//
// Copyright © 2025 Hardcore Engineering Inc.
//
// Licensed under the Eclipse Public License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. You may
// obtain a copy of the License at https://www.eclipse.org/legal/epl-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//
// See the License for the specific language governing permissions and
// limitations under the License.
//
import { AttrValue, MarkupNode, MarkupNodeType } from '@hcengineering/text'
import { dirname, join, relative } from 'path'
import { DocumentPreprocessorSpec, DocumentState } from './docx'
import documents from '@hcengineering/controlled-documents'
/**
 * Preprocessor spec that makes sure a document ends up with a header.
 *
 * A header already present on the document is kept. Otherwise the step's own options are used,
 * and when those are absent a controlled-document header is built from the document name and
 * the converter's owner.
 */
const _addStubHeader: DocumentPreprocessorSpec<DocumentState['header']> = (converter, inputOptions) => (document) => {
  const fallback: DocumentState['header'] = inputOptions ?? {
    class: 'documents:class:ControlledDocument',
    title: document.name,
    template: documents.template.ProductChangeControl,
    author: converter.options.owner,
    owner: converter.options.owner
  }
  return { ...document, header: document.header ?? fallback }
}
/**
 * Settings of the `_extractImages` preprocessor.
 */
interface ExtractImagesOptions {
// Output folder for extracted image files; `_extractImages` falls back to 'files'
folder?: string
// Maps an image MIME type to the file extension used for it; images of unlisted types are left inline
extensions?: Record<string, string>
}
/**
 * Preprocessor spec that moves images embedded as base64 `data:` URIs into separate output files
 * and rewrites the image nodes to point at those files.
 *
 * Images that are not base64 data URIs, or whose MIME type has no configured extension,
 * are left untouched.
 */
const _extractImages: DocumentPreprocessorSpec<ExtractImagesOptions> = (converter, inputOptions) => {
  const options = {
    folder: 'files',
    extensions: {
      'image/jpeg': '.jpeg',
      'image/jpg': '.jpeg',
      'image/png': '.png'
    },
    ...inputOptions
  }

  // Shared by all documents handled by this preprocessor instance, so file names never repeat
  let imageCount = 0

  interface Image {
    extension: string
    buffer: Buffer
  }

  // Decodes a `data:<type>[;param...];base64,<payload>` URI; returns undefined for anything else
  const extractBase64Image = (imageContent: AttrValue): Image | undefined => {
    if (typeof imageContent !== 'string' || !imageContent.startsWith('data:')) {
      return
    }

    // Without a comma there is no payload to decode
    const separator = imageContent.indexOf(',')
    if (separator === -1) {
      return
    }

    const descriptor = imageContent.slice('data:'.length, separator).split(';')
    // Only base64 payloads can be decoded below; other encodings would produce a corrupt file
    if (!descriptor.slice(1).includes('base64')) {
      return
    }

    const type = descriptor[0]
    const extension = options.extensions[type]
    if (extension === undefined) {
      return
    }

    // Decode only after the type is known to be supported
    const buffer = Buffer.from(imageContent.slice(separator + 1), 'base64')
    return { buffer, extension }
  }

  // Replaces an inline image node with one referencing an extracted file; `dir` is the
  // document's folder relative to the processing root
  const transformImage = (dir: string, node: MarkupNode): MarkupNode => {
    if (node.type !== MarkupNodeType.image) {
      return node
    }

    const image = extractBase64Image(node.attrs?.src ?? '')
    if (image === undefined) {
      return node
    }

    imageCount++
    const path = join(options.folder, 'image_' + imageCount + image.extension)

    // The link is written relative to the document's own folder
    node = { ...node, attrs: { ...node.attrs, src: relative(dir, path) } }
    converter.addOutputFile(path, image.buffer)

    return node
  }

  return (document) => {
    const dir = relative(document.root, dirname(document.path))
    const markup = transformMarkupRecursive(document.markup, (node) => transformImage(dir, node))
    return { ...document, markup }
  }
}
/**
 * Preprocessor spec that turns every table header cell in the markup into a regular table cell.
 */
const _cleanupMarkup: DocumentPreprocessorSpec<any> = (converter) => {
  const demoteHeaderCell = (node: MarkupNode): MarkupNode =>
    node.type === MarkupNodeType.table_header ? { ...node, type: MarkupNodeType.table_cell } : node

  return (document) => ({
    ...document,
    markup: transformMarkupRecursive(document.markup, demoteHeaderCell)
  })
}
/**
 * Built-in preprocessor specs keyed by the step name used to select them.
 */
export const defaultDocumentPreprocessors = {
_addStubHeader,
_extractImages,
_cleanupMarkup
}
/**
 * Applies `transformer` to every node of the tree, children before their parent.
 *
 * A node with content is copied with its transformed children before it is handed to
 * `transformer`; a node without content is passed through as is.
 */
function transformMarkupRecursive (node: MarkupNode, transformer: (node: MarkupNode) => MarkupNode): MarkupNode {
  const children = node.content
  if (children === undefined) {
    return transformer(node)
  }
  const rebuilt = children.map((child) => transformMarkupRecursive(child, transformer))
  return transformer({ ...node, content: rebuilt })
}

View File

@ -50,13 +50,13 @@ import documents, {
DocumentMeta
} from '@hcengineering/controlled-documents'
interface UnifiedComment {
export interface UnifiedComment {
author: string
text: string
attachments?: string[]
}
interface UnifiedIssueHeader {
export interface UnifiedIssueHeader {
class: 'tracker:class:Issue'
title: string
status: string
@ -67,7 +67,7 @@ interface UnifiedIssueHeader {
comments?: UnifiedComment[]
}
interface UnifiedSpaceSettings {
export interface UnifiedSpaceSettings {
class: 'tracker:class:Project' | 'document:class:Teamspace' | 'documents:class:OrgSpace'
title: string
private?: boolean
@ -79,7 +79,7 @@ interface UnifiedSpaceSettings {
emoji?: string
}
interface UnifiedProjectSettings extends UnifiedSpaceSettings {
export interface UnifiedProjectSettings extends UnifiedSpaceSettings {
class: 'tracker:class:Project'
identifier: string
id?: 'tracker:project:DefaultProject'
@ -87,16 +87,16 @@ interface UnifiedProjectSettings extends UnifiedSpaceSettings {
defaultIssueStatus?: string
}
interface UnifiedTeamspaceSettings extends UnifiedSpaceSettings {
export interface UnifiedTeamspaceSettings extends UnifiedSpaceSettings {
class: 'document:class:Teamspace'
}
interface UnifiedDocumentHeader {
export interface UnifiedDocumentHeader {
class: 'document:class:Document'
title: string
}
interface UnifiedWorkspaceSettings {
export interface UnifiedWorkspaceSettings {
projectTypes?: Array<{
name: string
taskTypes?: Array<{
@ -110,13 +110,13 @@ interface UnifiedWorkspaceSettings {
}>
}
interface UnifiedChangeControlHeader {
export interface UnifiedChangeControlHeader {
description?: string
reason?: string
impact?: string
}
interface UnifiedControlledDocumentHeader {
export interface UnifiedControlledDocumentHeader {
class: 'documents:class:ControlledDocument'
title: string
template: string
@ -129,7 +129,7 @@ interface UnifiedControlledDocumentHeader {
changeControl?: UnifiedChangeControlHeader
}
interface UnifiedDocumentTemplateHeader {
export interface UnifiedDocumentTemplateHeader {
class: 'documents:mixin:DocumentTemplate'
title: string
category: string
@ -143,7 +143,7 @@ interface UnifiedDocumentTemplateHeader {
changeControl?: UnifiedChangeControlHeader
}
interface UnifiedOrgSpaceSettings extends UnifiedSpaceSettings {
export interface UnifiedOrgSpaceSettings extends UnifiedSpaceSettings {
class: 'documents:class:OrgSpace'
qualified?: string
manager?: string

View File

@ -16,6 +16,8 @@
export * from './huly/unified'
export * from './clickup/clickup'
export * from './notion/notion'
export * from './docx/docx'
export * from './docx/preprocessors'
export * from './importer/uploader'
export * from './importer/storageUploader'

View File

@ -63,7 +63,7 @@
"dotenv": "~16.0.0",
"express": "^4.21.2",
"puppeteer": "^22.6.1",
"mammoth": "^1.6.0",
"mammoth": "^1.9.0",
"ws": "^8.18.0"
}
}

View File

@ -81,7 +81,7 @@
"jimp": "^0.16.1",
"jwt-simple": "^0.5.6",
"libphonenumber-js": "^1.9.46",
"mammoth": "^1.6.0",
"mammoth": "^1.9.0",
"mime-types": "~2.1.34",
"pdfjs-dist": "2.12.313",
"sharp": "~0.32.0",