From c55e6f00b0e4fa2d3e9cef5aeaa906c533a2d39a Mon Sep 17 00:00:00 2001 From: Alexey Zinoviev Date: Thu, 1 Aug 2024 09:05:07 +0400 Subject: [PATCH] EZQMS-1145: Fixes doc import tool (#6204) * EZQMS-1145: Fixes doc import tool Signed-off-by: Alexey Zinoviev --- common/config/rush/pnpm-lock.yaml | 177 ++++++++++++++++++- dev/doc-import-tool/package.json | 1 + dev/doc-import-tool/readme.md | 8 + dev/doc-import-tool/src/__start.ts | 4 +- dev/doc-import-tool/src/commands.ts | 15 +- dev/doc-import-tool/src/extract/extract.ts | 12 +- dev/doc-import-tool/src/extract/meta.ts | 45 ++++- dev/doc-import-tool/src/extract/nodes.ts | 22 ++- dev/doc-import-tool/src/extract/types.ts | 22 ++- dev/doc-import-tool/src/import.ts | 40 ++--- dev/doc-import-tool/src/index.ts | 16 +- dev/doc-import-tool/src/type/docx4js.d.ts | 3 + dev/doc-import-tool/toc2.json | 34 ++++ models/controlled-documents/src/migration.ts | 1 - plugins/controlled-documents/src/docutils.ts | 6 +- 15 files changed, 346 insertions(+), 60 deletions(-) create mode 100644 dev/doc-import-tool/readme.md create mode 100644 dev/doc-import-tool/src/type/docx4js.d.ts create mode 100644 dev/doc-import-tool/toc2.json diff --git a/common/config/rush/pnpm-lock.yaml b/common/config/rush/pnpm-lock.yaml index 13fd0bd30f..cdc24873f0 100644 --- a/common/config/rush/pnpm-lock.yaml +++ b/common/config/rush/pnpm-lock.yaml @@ -1403,6 +1403,9 @@ dependencies: diff2html: specifier: ~3.4.35 version: 3.4.48 + docx4js: + specifier: ^3.2.20 + version: 3.2.20 domhandler: specifier: ^5.0.3 version: 5.0.3 @@ -11414,6 +11417,15 @@ packages: - debug dev: false + /cfb@0.12.1: + resolution: {integrity: sha512-cP+4A0tTqtyza5gJwNwDetZ8FPjl0gPLE7mIxGKyUzOS6HkM23WaAWW/l3t7jIQSMqVXroa09Ey0lo7gV8LNxw==} + engines: {node: '>=0.8'} + hasBin: true + dependencies: + commander: 2.11.0 + printj: 1.1.2 + dev: false + /chalk@2.4.2: resolution: {integrity: sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==} engines: {node: 
'>=4'} @@ -11436,6 +11448,28 @@ packages: engines: {node: '>=10'} dev: false + /cheerio@0.22.0: + resolution: {integrity: sha512-8/MzidM6G/TgRelkzDG13y3Y9LxBjCb+8yOEZ9+wwq5gVF2w2pV0wmHvjfT0RvuxGyR7UEuK36r+yYMbT4uKgA==} + engines: {node: '>= 0.6'} + dependencies: + css-select: 1.2.0 + dom-serializer: 0.1.1 + entities: 1.1.2 + htmlparser2: 3.10.1 + lodash.assignin: 4.2.0 + lodash.bind: 4.2.1 + lodash.defaults: 4.2.0 + lodash.filter: 4.6.0 + lodash.flatten: 4.4.0 + lodash.foreach: 4.5.0 + lodash.map: 4.6.0 + lodash.merge: 4.6.2 + lodash.pick: 4.4.0 + lodash.reduce: 4.6.0 + lodash.reject: 4.6.0 + lodash.some: 4.6.0 + dev: false + /chokidar@3.6.0: resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==} engines: {node: '>= 8.10.0'} @@ -11686,6 +11720,10 @@ packages: engines: {node: '>=14'} dev: false + /commander@2.11.0: + resolution: {integrity: sha512-b0553uYA5YAEGgyYIGYROzKQ7X5RAqedkfjiZxwi0kL1g3bOaBNNZfYkzt/CL0umgD5wc9Jec2FbB98CjkMRvQ==} + dev: false + /commander@2.20.3: resolution: {integrity: sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==} dev: false @@ -12110,6 +12148,15 @@ packages: webpack: 5.90.3(esbuild@0.20.1)(webpack-cli@5.1.4) dev: false + /css-select@1.2.0: + resolution: {integrity: sha512-dUQOBoqdR7QwV90WysXPLXG5LO7nhYBgiWVfxF80DKPF8zx1t/pUd2FYy73emg3zrjtM6dzmYgbHKfV2rxiHQA==} + dependencies: + boolbase: 1.0.0 + css-what: 2.1.3 + domutils: 1.5.1 + nth-check: 1.0.2 + dev: false + /css-select@4.3.0: resolution: {integrity: sha512-wPpOYtnsVontu2mODhA19JrqWxNsfdatRKd64kmpRbQgh1KtItko5sTnEpPdpSaJszTOhEMlF/RPz28qj4HqhQ==} dependencies: @@ -12136,6 +12183,10 @@ packages: source-map-js: 1.0.2 dev: false + /css-what@2.1.3: + resolution: {integrity: sha512-a+EPoD+uZiNfh+5fxw2nO9QwFa6nJe2Or35fGY6Ipw1R3R4AGz1d1TEZrCegvw2YTmZ0jXirGYlzxxpYSHwpEg==} + dev: false + /css-what@6.1.0: resolution: {integrity: 
sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==} engines: {node: '>= 6'} @@ -12636,6 +12687,16 @@ packages: esutils: 2.0.3 dev: false + /docx4js@3.2.20: + resolution: {integrity: sha512-u7kfMRYAHsczusgnrRAnZ0bXSF1HqiQmKtAI5LNIoQfrYXUHmaukcXnJR98KEFW3fSVdYKuLyHP9vKhrJKzetw==} + dependencies: + cfb: 0.12.1 + cheerio: 0.22.0 + color: 3.2.1 + htmlparser2: 3.10.1 + jszip: 2.7.0 + dev: false + /dom-accessibility-api@0.5.16: resolution: {integrity: sha512-X7BJ2yElsnOJ30pZF4uIIDfBEVgF4XEBxL9Bxhy6dnrm5hkzqmsWHGTiHqRiITNhMyFLyAiWndIJP7Z1NTteDg==} dev: false @@ -12646,6 +12707,20 @@ packages: utila: 0.4.0 dev: false + /dom-serializer@0.1.1: + resolution: {integrity: sha512-l0IU0pPzLWSHBcieZbpOKgkIn3ts3vAh7ZuFyXNwJxJXk/c4Gwj9xaTJwIDVQCXawWD0qb3IzMGH5rglQaO0XA==} + dependencies: + domelementtype: 1.3.1 + entities: 1.1.2 + dev: false + + /dom-serializer@0.2.2: + resolution: {integrity: sha512-2/xPb3ORsQ42nHYiSunXkDjPLBaEj/xTwUO4B7XCZQTRk7EBtTOPaygh10YAAh2OI1Qrp6NWfpAhzswj0ydt9g==} + dependencies: + domelementtype: 2.3.0 + entities: 2.2.0 + dev: false + /dom-serializer@1.4.1: resolution: {integrity: sha512-VHwB3KfrcOOkelEG2ZOfxqLZdfkil8PtJi4P8N2MMXucZq2yLp75ClViUlOVwyoHEDjYU433Aq+5zWP61+RGag==} dependencies: @@ -12709,6 +12784,20 @@ packages: domelementtype: 2.3.0 dev: false + /domutils@1.5.1: + resolution: {integrity: sha512-gSu5Oi/I+3wDENBsOWBiRK1eoGxcywYSqg3rR960/+EfY0CF4EX1VPkgHOZ3WiS/Jg2DtliF6BhWcHlfpYUcGw==} + dependencies: + dom-serializer: 0.1.1 + domelementtype: 1.3.1 + dev: false + + /domutils@1.7.0: + resolution: {integrity: sha512-Lgd2XcJ/NjEw+7tFvfKxOzCYKZsdct5lczQ2ZaQY8Djz7pfAD3Gbp8ySJWtreII/vDlMVmxwa6pHmdxIYgttDg==} + dependencies: + dom-serializer: 0.2.2 + domelementtype: 1.3.1 + dev: false + /domutils@2.8.0: resolution: {integrity: sha512-w96Cjofp72M5IIhpjgobBimYEfoPjx1Vx0BSX9P30WBdZW2WIKU0T1Bd0kz2eNZ9ikjKgHbEyKx8BB6H1L3h3A==} dependencies: @@ -13056,6 +13145,10 @@ packages: strip-ansi: 6.0.1 dev: false + 
/entities@1.1.2: + resolution: {integrity: sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==} + dev: false + /entities@2.2.0: resolution: {integrity: sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A==} dev: false @@ -15402,6 +15495,17 @@ packages: entities: 2.2.0 dev: false + /htmlparser2@3.10.1: + resolution: {integrity: sha512-IgieNijUMbkDovyoKObU1DUhm1iwNYE/fuifEoEHfd1oZKZDaONBSkal7Y01shxsM49R4XaMdGez3WnF9UfiCQ==} + dependencies: + domelementtype: 1.3.1 + domhandler: 2.4.2 + domutils: 1.7.0 + entities: 1.1.2 + inherits: 2.0.4 + readable-stream: 3.6.2 + dev: false + /htmlparser2@6.1.0: resolution: {integrity: sha512-gyyPk6rgonLFEDGoeRgQNaEUvdJ4ktTmmUh/h2t7s+M8oPpIPxgNACWa+6ESR57kXstwqPiCut0V8NRpcwgU7A==} dependencies: @@ -16991,6 +17095,12 @@ packages: object.values: 1.2.0 dev: false + /jszip@2.7.0: + resolution: {integrity: sha512-JIsRKRVC3gTRo2vM4Wy9WBC3TRcfnIZU8k65Phi3izkvPH975FowRYtKGT6PxevA0XnJ/yO8b0QwV0ydVyQwfw==} + dependencies: + pako: 1.0.11 + dev: false + /jszip@3.10.1: resolution: {integrity: sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==} dependencies: @@ -17384,14 +17494,38 @@ packages: p-locate: 6.0.0 dev: false + /lodash.assignin@4.2.0: + resolution: {integrity: sha512-yX/rx6d/UTVh7sSVWVSIMjfnz95evAgDFdb1ZozC35I9mSFCkmzptOzevxjgbQUsc78NR44LVHWjsoMQXy9FDg==} + dev: false + + /lodash.bind@4.2.1: + resolution: {integrity: sha512-lxdsn7xxlCymgLYo1gGvVrfHmkjDiyqVv62FAeF2i5ta72BipE1SLxw8hPEPLhD4/247Ijw07UQH7Hq/chT5LA==} + dev: false + /lodash.debounce@4.0.8: resolution: {integrity: sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==} dev: false + /lodash.defaults@4.2.0: + resolution: {integrity: sha512-qjxPLHd3r5DnsdGacqOMU6pb/avJzdh9tFX2ymgoZE27BmjXrNy/y4LoaiTeAb+O3gL8AfpJGtqfX/ae2leYYQ==} + dev: false + /lodash.escaperegexp@4.1.2: resolution: {integrity: 
sha512-TM9YBvyC84ZxE3rgfefxUWiQKLilstD6k7PTGt6wfbtXF8ixIJLOL3VYyV/z+ZiPLsVxAsKAFVwWlWeb2Y8Yyw==} dev: false + /lodash.filter@4.6.0: + resolution: {integrity: sha512-pXYUy7PR8BCLwX5mgJ/aNtyOvuJTdZAo9EQFUvMIYugqmJxnrYaANvTbgndOzHSCSR0wnlBBfRXJL5SbWxo3FQ==} + dev: false + + /lodash.flatten@4.4.0: + resolution: {integrity: sha512-C5N2Z3DgnnKr0LOpv/hKCgKdb7ZZwafIrsesve6lmzvZIRZRGaZ/l6Q8+2W7NaT+ZwO3fFlSCzCzrDCFdJfZ4g==} + dev: false + + /lodash.foreach@4.5.0: + resolution: {integrity: sha512-aEXTF4d+m05rVOAUG3z4vZZ4xVexLKZGF0lIxuHZ1Hplpk/3B6Z1+/ICICYRLm7c41Z2xiejbkCkJoTlypoXhQ==} + dev: false + /lodash.includes@4.3.0: resolution: {integrity: sha512-W3Bx6mdkRTGtlJISOvVD/lbqjTlPPUDTMnlXZFnVwi9NKJ6tiAk6LVdlhZMm17VZisqhKcgzpO5Wz91PCt5b0w==} dev: false @@ -17420,6 +17554,10 @@ packages: resolution: {integrity: sha512-0wJxfxH1wgO3GrbuP+dTTk7op+6L41QCXbGINEmD+ny/G/eCqGzxyCsh7159S+mgDDcoarnBw6PC1PS5+wUGgw==} dev: false + /lodash.map@4.6.0: + resolution: {integrity: sha512-worNHGKLDetmcEYDvh2stPCrrQRkP20E4l0iIS7F8EvzMqBBi7ltvFN5m1HvTf1P7Jk1txKhvFcmYsCr8O2F1Q==} + dev: false + /lodash.memoize@4.1.2: resolution: {integrity: sha512-t7j+NzmgnQzTAYXcsHYLgimltOV1MXHtlOWf6GjL9Kj8GK5FInw5JotxvbOs+IvV1/Dzo04/fCGfLVs7aXb4Ag==} dev: false @@ -17432,6 +17570,22 @@ packages: resolution: {integrity: sha512-Sb487aTOCr9drQVL8pIxOzVhafOjZN9UU54hiN8PU3uAiSV7lx1yYNpbNmex2PK6dSJoNTSJUUswT651yww3Mg==} dev: false + /lodash.pick@4.4.0: + resolution: {integrity: sha512-hXt6Ul/5yWjfklSGvLQl8vM//l3FtyHZeuelpzK6mm99pNvN9yTDruNZPEJZD1oWrqo+izBmB7oUfWgcCX7s4Q==} + dev: false + + /lodash.reduce@4.6.0: + resolution: {integrity: sha512-6raRe2vxCYBhpBu+B+TtNGUzah+hQjVdu3E17wfusjyrXBka2nBS8OH/gjVZ5PvHOhWmIZTYri09Z6n/QfnNMw==} + dev: false + + /lodash.reject@4.6.0: + resolution: {integrity: sha512-qkTuvgEzYdyhiJBx42YPzPo71R1aEr0z79kAv7Ixg8wPFEjgRgJdUsGMG3Hf3OYSF/kHI79XhNlt+5Ar6OzwxQ==} + dev: false + + /lodash.some@4.6.0: + resolution: {integrity: 
sha512-j7MJE+TuT51q9ggt4fSgVqro163BEFjAt3u97IqU+JA2DkWl80nFTrowzLpZ/BnpN7rrl0JA/593NAdd8p/scQ==} + dev: false + /lodash@4.17.21: resolution: {integrity: sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==} dev: false @@ -18271,6 +18425,12 @@ packages: path-key: 4.0.0 dev: false + /nth-check@1.0.2: + resolution: {integrity: sha512-WeBOdju8SnzPN5vTUJYxYUxLeXpCaVP5i5e0LF8fg7WORF2Wd7wFX/pk0tYZk7s8T+J7VLy0Da6J1+wCT0AtHg==} + dependencies: + boolbase: 1.0.0 + dev: false + /nth-check@2.1.1: resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} dependencies: @@ -19358,6 +19518,12 @@ packages: engines: {node: '>= 0.8'} dev: false + /printj@1.1.2: + resolution: {integrity: sha512-zA2SmoLaxZyArQTOPj5LXecR+RagfPSU5Kw1qP+jkWeNlrq+eJZyY2oS68SU1Z/7/myXM4lo9716laOFAVStCQ==} + engines: {node: '>=0.8'} + hasBin: true + dev: false + /process-nextick-args@2.0.1: resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==} dev: false @@ -28764,7 +28930,7 @@ packages: dev: false file:projects/pod-calendar.tgz(bufferutil@4.0.8)(ts-node@10.9.2)(utf-8-validate@6.0.4): - resolution: {integrity: sha512-a3IO+LV4RLUrLCIwlzK6HfwU0q4QkepgRaQi4/ybG+He0Oqy/eWvZnHHXry0FjArSrZjr2QOnxOlYsipXv/zfg==, tarball: file:projects/pod-calendar.tgz} + resolution: {integrity: sha512-DOvMMTPpDOAuO8Vi8gEBTXBMQATYdZgtNWxvUyG0xbCArW9jEmKbrOVBV1ydZGOJqfqhennjv09+uWR/YUR+SQ==, tarball: file:projects/pod-calendar.tgz} id: file:projects/pod-calendar.tgz name: '@rush-temp/pod-calendar' version: 0.0.0 @@ -28994,7 +29160,7 @@ packages: dev: false file:projects/pod-gmail.tgz(bufferutil@4.0.8)(ts-node@10.9.2)(utf-8-validate@6.0.4): - resolution: {integrity: sha512-EgPrNV5SyE8PVkKfTc30asvvzcTaDEZnQ1krUOi8rIzQcgLuPIRgMwTc6yuqu3SAzC2zPrwudTMqzDPpwJTRxw==, tarball: file:projects/pod-gmail.tgz} + resolution: {integrity: 
sha512-ybpB+uVlKzWvEVZnrp6iPuDYR7OwWBEsHl3ivSmte8BhfH9Q5QfzBm/FrEUsyEBIN4KS/cGDUoNGv5Us5OVWVw==, tarball: file:projects/pod-gmail.tgz} id: file:projects/pod-gmail.tgz name: '@rush-temp/pod-gmail' version: 0.0.0 @@ -29056,7 +29222,7 @@ packages: dev: false file:projects/pod-love.tgz(bufferutil@4.0.8)(utf-8-validate@6.0.4): - resolution: {integrity: sha512-h64U5de7eDYWI5zbLFQtUEArWz+OVeQ3BWtCd2yy9Yw3uzgPOE+etmib6KtcC0MEcDSl3ZNN0UOrV1IDQzy+nQ==, tarball: file:projects/pod-love.tgz} + resolution: {integrity: sha512-g12q7ZxpvWr3/5nAZazdcKgw2rC4LVpWLPYs7qYtm4avIIGeu4FIT8WfV0E4wXPVdjdV4CX2WwZ7+1zSgDxYCA==, tarball: file:projects/pod-love.tgz} id: file:projects/pod-love.tgz name: '@rush-temp/pod-love' version: 0.0.0 @@ -29282,7 +29448,7 @@ packages: dev: false file:projects/pod-telegram.tgz(bufferutil@4.0.8)(ts-node@10.9.2)(utf-8-validate@6.0.4): - resolution: {integrity: sha512-qt/UwzvrmgaVI2yI5vdGPHaghv5JF+ysgUY9GwwarkzqlZNBScnUo262CYNWy26iPdhVqNbvGYPrAgVecGHBFg==, tarball: file:projects/pod-telegram.tgz} + resolution: {integrity: sha512-mBbMXZhgRl0R9gvFN+uxfdFHgXdF9cFyC200rXnS4PgGkGVm9jRmE8lJQTxiOOJyX8rPDm8qlLhCrlKyof+pEQ==, tarball: file:projects/pod-telegram.tgz} id: file:projects/pod-telegram.tgz name: '@rush-temp/pod-telegram' version: 0.0.0 @@ -29733,7 +29899,7 @@ packages: dev: false file:projects/qms-doc-import-tool.tgz: - resolution: {integrity: sha512-s2EDYV09exzo01lbGDlzMq9D0cx2OILHS8bqQCgntVVWEK9nQypMd4gi/nCbmVhyHRapQih7D81Nf6zuSUohJg==, tarball: file:projects/qms-doc-import-tool.tgz} + resolution: {integrity: sha512-jxeLHsk5jNj+ABvXoCiX9VTv7ovVKEdkwddcmwpdTGjt7hBgtYsirnVxkryV8tfWMs5y99wjroKGfwSqtyzzOg==, tarball: file:projects/qms-doc-import-tool.tgz} name: '@rush-temp/qms-doc-import-tool' version: 0.0.0 dependencies: @@ -29747,6 +29913,7 @@ packages: '@typescript-eslint/parser': 6.21.0(eslint@8.56.0)(typescript@5.3.3) commander: 8.3.0 cross-env: 7.0.3 + docx4js: 3.2.20 domhandler: 5.0.3 domutils: 3.1.0 esbuild: 0.20.1 diff --git 
a/dev/doc-import-tool/package.json b/dev/doc-import-tool/package.json index 4f74c8dd0c..897e8ed144 100644 --- a/dev/doc-import-tool/package.json +++ b/dev/doc-import-tool/package.json @@ -65,6 +65,7 @@ "form-data": "^4.0.0", "htmlparser2": "^9.0.0", "mammoth": "^1.6.0", + "docx4js": "^3.2.20", "node-fetch": "^2.6.6", "zod": "^3.22.4" } diff --git a/dev/doc-import-tool/readme.md b/dev/doc-import-tool/readme.md new file mode 100644 index 0000000000..9d6b60c801 --- /dev/null +++ b/dev/doc-import-tool/readme.md @@ -0,0 +1,8 @@ +_Note: if vscode fails to resolve docx4js.d.ts types, the following fragment needs to be added to compilerOptions in tsconfig.json_ + +``` + "typeRoots": [ + "./src/type", + "./node_modules/@types" + ], +``` \ No newline at end of file diff --git a/dev/doc-import-tool/src/__start.ts b/dev/doc-import-tool/src/__start.ts index 6da5803e4a..4bffe38b98 100644 --- a/dev/doc-import-tool/src/__start.ts +++ b/dev/doc-import-tool/src/__start.ts @@ -14,6 +14,4 @@ // import { docImportTool } from '.' -const productId = process.env.PRODUCT_ID ?? 'ezqms' - -docImportTool(productId) +docImportTool() diff --git a/dev/doc-import-tool/src/commands.ts b/dev/doc-import-tool/src/commands.ts index 209c8aef65..fe88bee01f 100644 --- a/dev/doc-import-tool/src/commands.ts +++ b/dev/doc-import-tool/src/commands.ts @@ -1,5 +1,8 @@ +import docx4js from 'docx4js' +import { AnyNode } from 'domhandler' + import extract from './extract/extract' -import { read } from './extract/types' +import { MetadataContainer, read } from './extract/types' import importExtractedFile from './import' import convert from './convert/convert' import { Config } from './config' @@ -10,9 +13,15 @@ export async function importDoc (config: Config): Promise { const spec = await read(specFile) console.log(`Spec: ${JSON.stringify(spec, undefined, 2)}`) + let headerRoot: AnyNode | undefined + if (spec.metadata.in === MetadataContainer.PageHeaderTableRow) { + const headerIdx = spec.metadata.headerIdx ??
1 + const docx = await docx4js.load(config.doc) + headerRoot = docx.getObjectPart(`word/header${headerIdx}.xml`).root()[0] + } + const contents = await convert(doc, backend) - const extractedFile = await extract(contents, spec) - // console.log(`Extracted data: ${JSON.stringify(extractedFile, undefined, 2)}`) + const extractedFile = await extract(contents, spec, headerRoot) await importExtractedFile(config, extractedFile) } diff --git a/dev/doc-import-tool/src/extract/extract.ts b/dev/doc-import-tool/src/extract/extract.ts index 8fbc8af385..ffe118ae88 100644 --- a/dev/doc-import-tool/src/extract/extract.ts +++ b/dev/doc-import-tool/src/extract/extract.ts @@ -1,5 +1,5 @@ import { parseDocument } from 'htmlparser2' -import { Document } from 'domhandler' +import { AnyNode, Document } from 'domhandler' import { FileSpec, FileSpecType, TocFileSpec } from './types' import { createMetadataExtractor } from './meta' @@ -28,10 +28,10 @@ class TocContentExtractor implements ContentExtractor { readonly type = FileSpecType.TOC ) {} - extract (doc: Document): ExtractedFile { + extract (doc: Document, headerRoot?: AnyNode): ExtractedFile { const metadataExtractor = createMetadataExtractor(this.spec.metadata) - const title = metadataExtractor.extractName(doc) - const oldId = metadataExtractor.extractId(doc) + const title = metadataExtractor.extractName(doc, headerRoot) + const oldId = metadataExtractor.extractId(doc, headerRoot) const docSpec = this.spec.spec @@ -59,10 +59,10 @@ class TocContentExtractor implements ContentExtractor { * @public * Extracts HTML file contents */ -export async function extract (contents: string, spec: FileSpec): Promise { +export async function extract (contents: string, spec: FileSpec, headerRoot?: AnyNode): Promise { const extractor = new TocContentExtractor(spec) const doc = parseDocument(contents) - return extractor.extract(doc) + return extractor.extract(doc, headerRoot) } export default extract diff --git 
a/dev/doc-import-tool/src/extract/meta.ts b/dev/doc-import-tool/src/extract/meta.ts index 3ddb4dec9d..edf47bb2f5 100644 --- a/dev/doc-import-tool/src/extract/meta.ts +++ b/dev/doc-import-tool/src/extract/meta.ts @@ -1,8 +1,15 @@ -import { Document, Element } from 'domhandler' +import { AnyNode, Document, Element, Text } from 'domhandler' import { find } from 'domutils' import { ElementType } from 'htmlparser2' -import { DocMetadataSpec, MetadataContainer, DocTableRowMetadata, DocMetaTagsMetadata } from './types' +import { + DocMetadataSpec, + MetadataContainer, + DocTableRowMetadata, + DocMetaTagsMetadata, + PageHeaderTableRowMetadata, + MetadataTableCell +} from './types' import { ELEMENT_LIMIT } from './common' import { TableNodeExtractor } from './nodes' import { TableContainer } from './container' @@ -73,7 +80,37 @@ export class TableRowDocMetadataExtractor implements DocMetadataExtractor { } } -type AnyDocMetadataExtractor = MetaTagsDocMetadataExtractor | TableRowDocMetadataExtractor +const maxElems = 10000 +export class PageHeaderTableRowDocMetadataExtractor implements DocMetadataExtractor { + constructor (readonly tableMetadata: PageHeaderTableRowMetadata) {} + + private getCellText (meta: MetadataTableCell, headerRoot?: AnyNode): string { + if (headerRoot === undefined) { + return '' + } + + const rows = find((n) => n.type === ElementType.Tag && n.name === 'w:tr', [headerRoot], true, maxElems) + const { row, col, slice } = meta.extract + const cell = find((n) => n.type === ElementType.Tag && n.name === 'w:tc', [rows[row]], true, maxElems)[col] + const textNodes = find((n) => n.type === ElementType.Text, [cell], true, maxElems) as Text[] + const text = textNodes.map((n) => n.data).join('') + + return slice === undefined ? 
text : text.slice(slice.start, slice.end) + } + + extractName (doc: Document, headerRoot?: AnyNode): string { + return this.getCellText(this.tableMetadata.docName, headerRoot) + } + + extractId (doc: Document, headerRoot?: AnyNode): string { + return this.getCellText(this.tableMetadata.docId, headerRoot) + } +} + +type AnyDocMetadataExtractor = + | MetaTagsDocMetadataExtractor + | TableRowDocMetadataExtractor + | PageHeaderTableRowDocMetadataExtractor export function createMetadataExtractor (metadata: DocMetadataSpec): AnyDocMetadataExtractor { switch (metadata.in) { @@ -81,5 +118,7 @@ export function createMetadataExtractor (metadata: DocMetadataSpec): AnyDocMetad return new MetaTagsDocMetadataExtractor(metadata) case MetadataContainer.TableRow: return new TableRowDocMetadataExtractor(metadata) + case MetadataContainer.PageHeaderTableRow: + return new PageHeaderTableRowDocMetadataExtractor(metadata) } } diff --git a/dev/doc-import-tool/src/extract/nodes.ts b/dev/doc-import-tool/src/extract/nodes.ts index fadc8faae8..c5f8766efe 100644 --- a/dev/doc-import-tool/src/extract/nodes.ts +++ b/dev/doc-import-tool/src/extract/nodes.ts @@ -103,7 +103,7 @@ export class TableNodeExtractor implements NodeExtractor { private parseRows (table: AnyDomNode): AnyNode[][] { const header = findOne((n) => n.tagName === 'thead', [table]) const body = findOne((n) => n.tagName === 'tbody', [table]) - const bodyRows = + let bodyRows = body != null ? getChildren(body).filter((n) => clean(innerText(n)) !== '') : findAll((n) => n.tagName === 'tr' && clean(innerText(n)) !== '', [table]) @@ -111,14 +111,28 @@ export class TableNodeExtractor implements NodeExtractor { if (header != null) { const firstRow = findOne((n) => n.tagName === 'tr', [header]) + if (bodyRows.length > 0) { + if (getChildren(bodyRows[0]).find((n) => n.type === ElementType.Tag && n.tagName === 'th') != null) { + bodyRows = bodyRows.slice(1) + } + } + return [ findAll((n) => n.tagName === 'th', firstRow != null ? 
[firstRow] : []), - ...bodyRows.map((r) => getChildren(r).filter((n) => n.type === ElementType.Tag && n.tagName === 'td')) + ...bodyRows.map((r) => + getChildren(r).filter((n) => n.type === ElementType.Tag && (n.tagName === 'td' || n.tagName === 'th')) + ) ] } else if (bodyRows.length > 0) { return [ - getChildren(bodyRows[0]).filter((n) => n.type === ElementType.Tag && n.tagName === 'td'), - ...bodyRows.slice(1).map((r) => getChildren(r).filter((n) => n.type === ElementType.Tag && n.tagName === 'td')) + getChildren(bodyRows[0]).filter( + (n) => n.type === ElementType.Tag && (n.tagName === 'td' || n.tagName === 'th') + ), + ...bodyRows + .slice(1) + .map((r) => + getChildren(r).filter((n) => n.type === ElementType.Tag && (n.tagName === 'td' || n.tagName === 'th')) + ) ] } diff --git a/dev/doc-import-tool/src/extract/types.ts b/dev/doc-import-tool/src/extract/types.ts index 2c941fbf7b..b92e320873 100644 --- a/dev/doc-import-tool/src/extract/types.ts +++ b/dev/doc-import-tool/src/extract/types.ts @@ -114,7 +114,8 @@ export type TocSectionSpec = z.infer export enum MetadataContainer { MetaTags = 'meta-tags', - TableRow = 'table-row' + TableRow = 'table-row', + PageHeaderTableRow = 'page-header-table-row' } const metaTagsMetadata = z.object({ @@ -126,9 +127,16 @@ const metaTagsMetadata = z.object({ const metadataTableCell = z.object({ extract: z.object({ row: z.number().min(0), - col: z.number().min(0) + col: z.number().min(0), + slice: z + .object({ + start: z.number().min(0).optional(), + end: z.number().min(0).optional() + }) + .optional() }) }) +export type MetadataTableCell = z.infer const tableRowMetadata = z.object({ in: z.literal(MetadataContainer.TableRow), @@ -137,11 +145,19 @@ const tableRowMetadata = z.object({ docId: metadataTableCell }) -const docMetadata = z.union([metaTagsMetadata, tableRowMetadata]) +const pageHeaderTableRowMetadata = z.object({ + in: z.literal(MetadataContainer.PageHeaderTableRow), + headerIdx: z.number().min(1).optional(), + docName: 
metadataTableCell, + docId: metadataTableCell +}) + +const docMetadata = z.union([metaTagsMetadata, tableRowMetadata, pageHeaderTableRowMetadata]) export type DocMetadataSpec = z.infer export type DocMetaTagsMetadata = z.infer export type DocTableRowMetadata = z.infer +export type PageHeaderTableRowMetadata = z.infer // #endregion diff --git a/dev/doc-import-tool/src/import.ts b/dev/doc-import-tool/src/import.ts index 5c63bae30c..d3e47f0514 100644 --- a/dev/doc-import-tool/src/import.ts +++ b/dev/doc-import-tool/src/import.ts @@ -41,18 +41,13 @@ import { compareStrExact, uploadFile } from './helpers' export default async function importExtractedFile (config: Config, extractedFile: ExtractedFile): Promise { const { workspaceId } = config - const token = generateToken(systemAccountEmail, workspaceId) - - const transactorUrl = await getTransactorEndpoint(token) - + const transactorUrl = await getTransactorEndpoint(token, 'external') console.log(`Connecting to transactor: ${transactorUrl} (ws: '${workspaceId.name}')`) - const connection = (await createClient(transactorUrl, token)) as CoreClient & BackupClient try { console.log(`Connected to ${transactorUrl}`) - const txops = new TxOperations(connection, core.account.System) try { @@ -73,17 +68,18 @@ async function createDocument ( config: Config ): Promise> { const { owner, space } = config - console.log('Creating document from extracted data') const templateId = await createTemplateIfNotExist(txops, extractedFile.prefix, config) - const { title, prefix } = extractedFile + const { title, prefix, oldId } = extractedFile const docId: Ref = generateId() + const ccRecordId = generateId() + const data: AttachedData = { title, prefix, - code: '', + code: oldId, seqNumber: 0, major: 0, minor: 1, @@ -95,7 +91,7 @@ async function createDocument ( reviewers: [], approvers: [], coAuthors: [], - changeControl: '' as Ref, + changeControl: ccRecordId, author: owner, owner, category: '' as Ref, @@ -103,13 +99,13 @@ async function 
createDocument ( effectiveDate: 0, reviewInterval: DEFAULT_PERIODIC_REVIEW_INTERVAL, content: getCollaborativeDoc(generateId()), - snapshots: 0 + snapshots: 0, + plannedEffectiveDate: 0 } - const ccRecordId = generateId() const ccRecord: Data = { description: '', - reason: '', // TODO: move to config + reason: 'Imported document', // TODO: move to config impact: '', impactedDocuments: [] } @@ -135,29 +131,28 @@ async function createTemplateIfNotExist ( ): Promise> { const { owner, space } = config - console.log(`Getting template ${prefix}`) + console.log(`Getting template with doc ${prefix}`) - const template = await txops.findOne(documents.mixin.DocumentTemplate, { prefix }) + const template = await txops.findOne(documents.mixin.DocumentTemplate, { docPrefix: prefix }) if (template != null) { return template._id } - console.log(`Creating template with prefix: ${prefix}`) + console.log(`Creating template with doc prefix: ${prefix}`) const ccRecordId = generateId() const ccRecord: Data = { description: '', - reason: '', // TODO: move to config + reason: 'Imported template', // TODO: move to config impact: '', impactedDocuments: [] } const templateId: Ref = generateId() const category = '' as Ref // TODO: move to config - const data: AttachedData = { - prefix: 'IMP', + const data = { title: 'Import template', - code: templateId, + code: '', seqNumber: 0, sections: 0, category, @@ -172,12 +167,13 @@ async function createTemplateIfNotExist ( coAuthors: [], changeControl: ccRecordId, content: getCollaborativeDoc(generateId()), - snapshots: 0 + snapshots: 0, + plannedEffectiveDate: 0 } const { success } = await createDocumentTemplate( txops, - documents.class.Document, + documents.class.ControlledDocument, space, documents.mixin.DocumentTemplate, documents.ids.NoProject, diff --git a/dev/doc-import-tool/src/index.ts b/dev/doc-import-tool/src/index.ts index 3c8105c3fc..52bb46510e 100644 --- a/dev/doc-import-tool/src/index.ts +++ b/dev/doc-import-tool/src/index.ts @@ 
-28,16 +28,16 @@ import { getBackend } from './convert/convert' /** * @public */ -export function docImportTool (productId: string): void { +export function docImportTool (): void { const serverSecret = process.env.SERVER_SECRET if (serverSecret === undefined) { console.error('please provide server secret') process.exit(1) } - const accountUrl = process.env.ACCOUNT_URL + const accountUrl = process.env.ACCOUNTS_URL if (accountUrl === undefined) { - console.error('please provide transactor url') + console.error('please provide account url') process.exit(1) } @@ -64,7 +64,7 @@ export function docImportTool (productId: string): void { program .command('import ') .description('import doc into workspace') - .option('-s|--spec ', 'Specification file') + .option('-s|--spec ', 'Specification file') .option('-b|--backend ', 'Conversion backend', 'pandoc') .option('--space ', 'Doc space ID', documents.space.QualityDocuments) .action( @@ -72,22 +72,22 @@ export function docImportTool (productId: string): void { doc: string, workspace: string, owner: Ref, - cmd: { backend: string, space: Ref, specFile?: string } + cmd: { backend: string, space: Ref, spec?: string } ) => { console.log( `Importing document '${doc}' into workspace '${workspace}', owner: ${JSON.stringify(owner)}, spec: ${ - cmd.specFile + cmd.spec }, space: ${cmd.space}, backend: ${cmd.backend}` ) try { - const workspaceId = getWorkspaceId(workspace, productId) + const workspaceId = getWorkspaceId(workspace) const config: Config = { doc, workspaceId, owner, backend: getBackend(cmd.backend), - specFile: cmd.specFile, + specFile: cmd.spec, space: cmd.space, uploadURL: uploadUrl, collaboratorApiURL: collaboratorApiUrl, diff --git a/dev/doc-import-tool/src/type/docx4js.d.ts b/dev/doc-import-tool/src/type/docx4js.d.ts new file mode 100644 index 0000000000..2a5b964648 --- /dev/null +++ b/dev/doc-import-tool/src/type/docx4js.d.ts @@ -0,0 +1,3 @@ +declare module 'docx4js' { + export = any +} diff --git 
a/dev/doc-import-tool/toc2.json b/dev/doc-import-tool/toc2.json new file mode 100644 index 0000000000..b0e02802a6 --- /dev/null +++ b/dev/doc-import-tool/toc2.json @@ -0,0 +1,34 @@ +{ + "prefix": "SOP5", + "type": "toc", + "metadata": { + "in": "page-header-table-row", + "headerIdx": 2, + "docName": { + "extract": { "row": 2, "col": 1 } + }, + "docId": { + "extract": { "row": 2, "col": 0 } + } + }, + "spec": { + "toc": { + "type": "toc", + "node": { + "type": "toc-paragraph-seq", + "params": { + "sectionHeaders": { + "tags": ["h1"] + }, + "start": { + "patterns": ["Table of contents", "Table des matières", "CONTENTS"], + "tags": ["h1", "h2", "h3", "p"] + }, + "end": { + "tags": ["h1"] + } + } + } + } + } +} diff --git a/models/controlled-documents/src/migration.ts b/models/controlled-documents/src/migration.ts index e81d4f9d7f..6bd6f27906 100644 --- a/models/controlled-documents/src/migration.ts +++ b/models/controlled-documents/src/migration.ts @@ -148,7 +148,6 @@ async function createProductChangeControlTemplate (tx: TxOperations): Promise | undefined, templateId: Ref, prefix: string, - spec: AttachedData, + spec: Omit, 'prefix'>, category: Ref, author?: Ref, defaultSection?: { title: string } @@ -224,6 +224,7 @@ export async function createDocumentTemplate ( ) const seqNumber = (incResult as any).object.sequence as number const collaborativeDocId = getCollaborativeDocForDocument('TPL-DOC', seqNumber, 0, 1) + const code = spec.code === '' ? `${TEMPLATE_PREFIX}-${seqNumber}` : spec.code let path: Array> = [] @@ -239,7 +240,7 @@ export async function createDocumentTemplate ( }) ops.notMatch(documents.class.Document, { - code: spec.code + code }) ops.notMatch(documents.mixin.DocumentTemplate, { @@ -280,6 +281,7 @@ export async function createDocumentTemplate ( 'documents', { ...spec, + code, seqNumber, category, prefix: TEMPLATE_PREFIX,