PDF importer (#7610)

Signed-off-by: Denis Bykhov <bykhov.denis@gmail.com>
Denis Bykhov 2025-01-09 07:38:06 +05:00 committed by GitHub
parent 52e44bfb9f
commit d63db0fca6
8 changed files with 371 additions and 27 deletions

.vscode/launch.json

@@ -584,7 +584,8 @@
"PASSWORD": "password",
"AVATAR_PATH": "./assets/avatar.png",
"AVATAR_CONTENT_TYPE": ".png",
"LOVE_ENDPOINT": "http://localhost:8096"
"STORAGE_CONFIG": "minio|localhost?accessKey=minioadmin&secretKey=minioadmin",
"LOVE_ENDPOINT": "http://localhost:8096",
},
"runtimeArgs": ["--nolazy", "-r", "ts-node/register"],
"sourceMaps": true,

@@ -27298,7 +27298,7 @@ packages:
dev: false
file:projects/pod-ai-bot.tgz(bufferutil@4.0.8)(utf-8-validate@6.0.4)(zod@3.23.8):
resolution: {integrity: sha512-zCUFPyseaS/JuRQ5JMtR1n8ybdRgxQzRn2aP528X1x+jOBVEH2L7VjDGBx1jsJGuUKvCJBCoOAQlCHDgfRambQ==, tarball: file:projects/pod-ai-bot.tgz}
resolution: {integrity: sha512-OPo+KhRKsPQhO1eqOIgL30ef9s/TAnEf21bAwzxLLcB70AIwMyjfPRzzJHDOIbVjCzA8t4YFQ5MWWoLUveylFg==, tarball: file:projects/pod-ai-bot.tgz}
id: file:projects/pod-ai-bot.tgz
name: '@rush-temp/pod-ai-bot'
version: 0.0.0
@@ -27309,6 +27309,7 @@ packages:
'@types/jest': 29.5.12
'@types/node': 20.11.19
'@types/node-fetch': 2.6.11
'@types/uuid': 8.3.4
'@types/ws': 8.5.11
'@typescript-eslint/eslint-plugin': 6.21.0(@typescript-eslint/parser@6.21.0)(eslint@8.56.0)(typescript@5.3.3)
'@typescript-eslint/parser': 6.21.0(eslint@8.56.0)(typescript@5.3.3)
@@ -27333,6 +27334,7 @@ packages:
ts-jest: 29.1.2(esbuild@0.24.2)(jest@29.7.0)(typescript@5.3.3)
ts-node: 10.9.2(@types/node@20.11.19)(typescript@5.3.3)
typescript: 5.3.3
uuid: 8.3.2
ws: 8.18.0(bufferutil@4.0.8)(utf-8-validate@6.0.4)
transitivePeerDependencies:
- '@aws-sdk/credential-providers'

@@ -46,6 +46,7 @@
"eslint-plugin-n": "^15.4.0",
"eslint-plugin-node": "^11.1.0",
"eslint-plugin-promise": "^6.1.1",
"@types/uuid": "^8.3.1",
"jest": "^29.7.0",
"prettier": "^3.1.0",
"ts-jest": "^29.1.1",
@@ -56,6 +57,8 @@
"@hcengineering/account": "^0.6.0",
"@hcengineering/ai-bot": "^0.6.0",
"@hcengineering/analytics-collector": "^0.6.0",
"@hcengineering/document": "^0.6.0",
"@hcengineering/attachment": "^0.6.14",
"@hcengineering/chunter": "^0.6.20",
"@hcengineering/client": "^0.6.18",
"@hcengineering/client-resources": "^0.6.27",
@@ -72,6 +75,8 @@
"@hcengineering/server-token": "^0.6.11",
"@hcengineering/setting": "^0.6.17",
"@hcengineering/text": "^0.6.5",
"@hcengineering/rank": "^0.6.4",
"@hcengineering/server-storage": "^0.6.0",
"@hcengineering/workbench": "^0.6.16",
"@hcengineering/love": "^0.6.0",
"cors": "^2.8.5",
@@ -80,6 +85,7 @@
"fast-equals": "^5.0.1",
"form-data": "^4.0.0",
"js-tiktoken": "^1.0.14",
"uuid": "^8.3.2",
"mongodb": "^6.12.0",
"openai": "^4.56.0",
"ws": "^8.18.0"

@@ -36,6 +36,7 @@ interface Config {
MaxHistoryRecords: number
Port: number
LoveEndpoint: string
DataLabApiKey: string
}
const parseNumber = (str: string | undefined): number | undefined => (str !== undefined ? Number(str) : undefined)
@@ -61,7 +62,8 @@ const config: Config = (() => {
MaxContentTokens: parseNumber(process.env.MAX_CONTENT_TOKENS) ?? 128 * 100,
MaxHistoryRecords: parseNumber(process.env.MAX_HISTORY_RECORDS) ?? 500,
Port: parseNumber(process.env.PORT) ?? 4010,
LoveEndpoint: process.env.LOVE_ENDPOINT ?? ''
LoveEndpoint: process.env.LOVE_ENDPOINT ?? '',
DataLabApiKey: process.env.DATALAB_API_KEY ?? ''
}
const missingEnv = (Object.keys(params) as Array<keyof Config>).filter((key) => params[key] === undefined)
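
Note: a small sketch of how the new key gates the feature, based on the check added in tools.ts below; the helper name is hypothetical:

import config from './config'

// PDF import is effectively optional: with an empty DATALAB_API_KEY,
// pdfToMarkdown() returns undefined and saveFile() reports the failure.
function isPdfImportEnabled (): boolean {
  return config.DataLabApiKey !== ''
}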

@@ -29,22 +29,24 @@ import {
TranslateRequest,
TranslateResponse
} from '@hcengineering/ai-bot'
import { Markup, MeasureContext, Ref, WorkspaceId } from '@hcengineering/core'
import { Room } from '@hcengineering/love'
import { WorkspaceInfoRecord } from '@hcengineering/server-ai-bot'
import { getTransactorEndpoint } from '@hcengineering/server-client'
import { generateToken } from '@hcengineering/server-token'
import OpenAI from 'openai'
import { encodingForModel } from 'js-tiktoken'
import { htmlToMarkup, markupToHTML } from '@hcengineering/text'
import { Markup, MeasureContext, Ref, WorkspaceId } from '@hcengineering/core'
import { Room } from '@hcengineering/love'
import { encodingForModel } from 'js-tiktoken'
import OpenAI from 'openai'
import { WorkspaceClient } from './workspace/workspaceClient'
import { StorageAdapter } from '@hcengineering/server-core'
import { buildStorageFromConfig, storageConfigFromEnv } from '@hcengineering/server-storage'
import config from './config'
import { DbStorage } from './storage'
import { SupportWsClient } from './workspace/supportWsClient'
import { AIReplyTransferData } from './types'
import { tryAssignToWorkspace } from './utils/account'
import { translateHtml } from './utils/openai'
import { SupportWsClient } from './workspace/supportWsClient'
import { WorkspaceClient } from './workspace/workspaceClient'
const CLOSE_INTERVAL_MS = 10 * 60 * 1000 // 10 minutes
@@ -54,6 +56,7 @@ export class AIControl {
private readonly connectingWorkspaces = new Map<string, Promise<void>>()
readonly aiClient?: OpenAI
readonly storageAdapter: StorageAdapter
readonly encoding = encodingForModel(config.OpenAIModel)
supportClient: SupportWsClient | undefined = undefined
@@ -70,6 +73,7 @@
})
: undefined
void this.connectSupportWorkspace()
this.storageAdapter = buildStorageFromConfig(storageConfigFromEnv())
}
async getWorkspaceRecord (workspace: string): Promise<WorkspaceInfoRecord> {
@@ -125,10 +129,26 @@ export class AIControl {
this.ctx.info('Listen workspace: ', { workspace })
if (workspace === config.SupportWorkspace) {
return new SupportWsClient(endpoint, token, workspace, this, this.ctx.newChild(workspace, {}), info)
return new SupportWsClient(
this.storageAdapter,
endpoint,
token,
workspace,
this,
this.ctx.newChild(workspace, {}),
info
)
}
return new WorkspaceClient(endpoint, token, workspace, this, this.ctx.newChild(workspace, {}), info)
return new WorkspaceClient(
this.storageAdapter,
endpoint,
token,
workspace,
this,
this.ctx.newChild(workspace, {}),
info
)
}
async initWorkspaceClient (workspace: string): Promise<void> {
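
Note: the adapter built from STORAGE_CONFIG is injected into every workspace client and replaces the per-client BlobClient. A rough sketch of the stat/get pattern those clients use with it (function name and ids are placeholders; the calls mirror pdfToMarkdown in tools.ts below):

import { MeasureContext, WorkspaceId } from '@hcengineering/core'
import { StorageAdapter } from '@hcengineering/server-core'

// Verify the blob is a PDF before downloading it, as the importer does.
async function fetchPdf (ctx: MeasureContext, storage: StorageAdapter, workspaceId: WorkspaceId, blobId: string) {
  const stat = await storage.stat(ctx, workspaceId, blobId)
  if (stat?.contentType !== 'application/pdf') return undefined
  return await storage.get(ctx, workspaceId, blobId) // readable stream, see stream2buffer in tools.ts
}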

@@ -13,12 +13,13 @@
// limitations under the License.
//
import OpenAI from 'openai'
import { countTokens } from '@hcengineering/openai'
import { Tiktoken } from 'js-tiktoken'
import OpenAI from 'openai'
import config from '../config'
import { HistoryRecord } from '../types'
import { WorkspaceClient } from '../workspace/workspaceClient'
import { getTools } from './tools'
export async function translateHtml (client: OpenAI, html: string, lang: string): Promise<string | undefined> {
const response = await client.chat.completions.create({
@@ -66,6 +67,58 @@ export async function createChatCompletion (
return undefined
}
export async function createChatCompletionWithTools (
workspaceClient: WorkspaceClient,
client: OpenAI,
message: OpenAI.ChatCompletionMessageParam,
user?: string,
history: OpenAI.ChatCompletionMessageParam[] = [],
skipCache = true
): Promise<
| {
completion: string | undefined
usage: number
}
| undefined
> {
const opt: OpenAI.RequestOptions = {}
if (skipCache) {
opt.headers = { 'cf-skip-cache': 'true' }
}
try {
const res = client.beta.chat.completions
.runTools(
{
messages: [
{
role: 'system',
content: 'Use tools when possible; after a tool has been used successfully for the user request, don\'t rely on previous information'
},
...history,
message
],
model: config.OpenAIModel,
user,
tools: getTools(workspaceClient, user)
},
opt
)
.on('message', (message) => {
console.log(message)
})
const str = await res.finalContent()
const usage = (await res.totalUsage()).completion_tokens
return {
completion: str ?? undefined,
usage
}
} catch (e) {
console.error(e)
}
return undefined
}
export async function requestSummary (
aiClient: OpenAI,
encoding: Tiktoken,
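
Note: a minimal usage sketch of the new helper, mirroring the call added in workspaceClient.ts below; workspaceClient, aiClient, user and history are assumed to exist in the calling scope:

const prompt: OpenAI.ChatCompletionMessageParam = {
  role: 'user',
  content: 'Please import the attached PDF into the Documents teamspace'
}
const result = await createChatCompletionWithTools(workspaceClient, aiClient, prompt, user, history)
if (result !== undefined) {
  // result.completion is the final assistant text, result.usage the completion token count
  console.log(result.completion, result.usage)
}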

@@ -0,0 +1,243 @@
import { Account, MarkupBlobRef, Ref } from '@hcengineering/core'
import document, { Document, getFirstRank, Teamspace } from '@hcengineering/document'
import { makeRank } from '@hcengineering/rank'
import { parseMessageMarkdown } from '@hcengineering/text'
import {
BaseFunctionsArgs,
RunnableFunctionWithoutParse,
RunnableFunctionWithParse,
RunnableToolFunction,
RunnableToolFunctionWithoutParse,
RunnableToolFunctionWithParse,
RunnableTools
} from 'openai/lib/RunnableFunction'
import { Stream } from 'stream'
import { v4 as uuid } from 'uuid'
import config from '../config'
import { WorkspaceClient } from '../workspace/workspaceClient'
async function stream2buffer (stream: Stream): Promise<Buffer> {
return await new Promise<Buffer>((resolve, reject) => {
const _buf = Array<any>()
stream.on('data', (chunk) => {
_buf.push(chunk)
})
stream.on('end', () => {
resolve(Buffer.concat(_buf))
})
stream.on('error', (err) => {
reject(new Error(`error converting stream - ${err}`))
})
})
}
async function pdfToMarkdown (
workspaceClient: WorkspaceClient,
fileId: string,
name: string | undefined
): Promise<string | undefined> {
if (config.DataLabApiKey !== '') {
try {
const stat = await workspaceClient.storage.stat(workspaceClient.ctx, { name: workspaceClient.workspace }, fileId)
if (stat?.contentType !== 'application/pdf') {
return
}
const file = await workspaceClient.storage.get(workspaceClient.ctx, { name: workspaceClient.workspace }, fileId)
const buffer = await stream2buffer(file)
const url = 'https://www.datalab.to/api/v1/marker'
const formData = new FormData()
formData.append('file', new Blob([buffer], { type: 'application/pdf' }), name ?? 'test.pdf')
formData.append('force_ocr', 'false')
formData.append('paginate', 'false')
formData.append('output_format', 'markdown')
formData.append('use_llm', 'false')
formData.append('strip_existing_ocr', 'false')
formData.append('disable_image_extraction', 'false')
const headers = { 'X-Api-Key': config.DataLabApiKey }
const response = await fetch(url, {
method: 'POST',
body: formData,
headers
})
const data = await response.json()
console.log('data', data)
if (data.request_check_url !== undefined) {
for (let attempt = 0; attempt < 10; attempt++) {
const resp = await fetch(data.request_check_url, { headers })
const result = await resp.json()
if (result.status === 'complete' && result.markdown !== undefined) {
return result.markdown
}
await new Promise((resolve) => setTimeout(resolve, 2000))
}
}
} catch (e) {
console.error(e)
}
}
}
async function saveFile (
workspaceClient: WorkspaceClient,
user: string | undefined,
args: { fileId: string, folder: string | undefined, parent: string | undefined, name: string }
): Promise<string> {
console.log('Save file', args)
const content = await pdfToMarkdown(workspaceClient, args.fileId, args.name)
if (content === undefined) {
return 'Error while converting pdf to markdown'
}
const converted = JSON.stringify(parseMessageMarkdown(content, 'image://'))
const client = await workspaceClient.opClient
const fileId = uuid()
await workspaceClient.storage.put(
workspaceClient.ctx,
{ name: workspaceClient.workspace },
fileId,
converted,
'application/json'
)
const teamspaces = await client.findAll(document.class.Teamspace, {})
const parent = await client.findOne(document.class.Document, { _id: args.parent as Ref<Document> })
const teamspaceId = getTeamspace(args.folder, parent, teamspaces)
const parentId = parent?._id ?? document.ids.NoParent
const lastRank = await getFirstRank(client, teamspaceId, parentId)
const rank = makeRank(lastRank, undefined)
const _id = await client.createDoc(document.class.Document, teamspaceId, {
title: args.name,
parent: parentId,
content: fileId as MarkupBlobRef,
rank
})
return `File saved as ${args.name} with id ${_id}, always provide mention link as: [](ref://?_class=document%3Aclass%3ADocument&_id=${_id}&label=${args.name})`
}
function getTeamspace (
folder: string | undefined,
parent: Document | undefined,
teamspaces: Teamspace[]
): Ref<Teamspace> {
if (parent !== undefined) return parent.space
if (folder !== undefined) {
const teamspace = teamspaces.find(
(p) => p.name.trim().toLowerCase() === folder.trim().toLowerCase() || p._id === folder
)
if (teamspace !== undefined) return teamspace._id
}
return teamspaces[0]._id
}
async function getFoldersForDocuments (
workspaceClient: WorkspaceClient,
user: string | undefined,
args: Record<string, any>
): Promise<string> {
const client = await workspaceClient.opClient
const spaces = await client.findAll(
document.class.Teamspace,
user !== undefined ? { members: user as Ref<Account>, archived: false } : { archived: false }
)
let res = 'Folders:\n'
for (const space of spaces) {
res += `Id: ${space._id} Name: ${space.name}\n`
}
res += 'Parents:\n'
const parents = await client.findAll(document.class.Document, { space: { $in: spaces.map((p) => p._id) } })
for (const parent of parents) {
res += `Id: ${parent._id} Name: ${parent.title}\n`
}
return res
}
type ChangeFields<T, R> = Omit<T, keyof R> & R
type PredefinedTool<T extends object | string> = ChangeFields<
RunnableToolFunction<T>,
{
function: PredefinedToolFunction<T>
}
>
type PredefinedToolFunction<T extends object | string> = Omit<
T extends string ? RunnableFunctionWithoutParse : RunnableFunctionWithParse<any>,
'function'
>
type ToolFunc = (workspaceClient: WorkspaceClient, user: string | undefined, args: any) => Promise<string> | string
const tools: [PredefinedTool<any>, ToolFunc][] = []
export function registerTool<T extends object | string> (tool: PredefinedTool<T>, func: ToolFunc): void {
tools.push([tool, func])
}
registerTool(
{
type: 'function',
function: {
name: 'getDataBeforeImport',
parameters: {
type: 'object',
properties: {}
},
description:
'Get folders and parents for documents. This step is necessary before the saveFile tool. YOU MUST USE IT BEFORE importing a file.'
}
},
getFoldersForDocuments
)
registerTool<object>(
{
type: 'function',
function: {
name: 'saveFile',
parse: JSON.parse,
parameters: {
type: 'object',
required: ['fileId', 'folder', 'name'],
properties: {
fileId: { type: 'string', description: 'File id to parse' },
folder: {
type: 'string',
default: '',
description:
'Folder id from getDataBeforeImport. If not provided, try to guess it from the file name and folder names or from other file names; if you can\'t, ask the user. Don\'t pass an empty value, this field is required. If there are no folders at all, stop the pipeline and ask the user to create a teamspace'
},
parent: {
type: 'string',
default: '',
description:
'Parent document, use id from getDataBeforeImport; leave an empty string if not provided, it is not necessary, feel free to pass an empty string'
},
name: {
type: 'string',
description: 'Name for file, try to recognize from user input, if not provided use attached file name'
}
}
},
description:
'Parse a PDF to markdown and save it; used for importing files. Use only if a file is provided in the current message and the user asks to import/save it; if no file is provided, ask the user to attach one. You MUST call the getDataBeforeImport tool first to get ids. Use the file name as the name if the user does not provide one, and don\'t reuse old parameters. You can ask the user about the folder if you do not have enough data to determine the folder id'
}
},
saveFile
)
export function getTools (workspaceClient: WorkspaceClient, user: string | undefined): RunnableTools<BaseFunctionsArgs> {
const result: (RunnableToolFunctionWithoutParse | RunnableToolFunctionWithParse<any>)[] = []
for (const tool of tools) {
const res: RunnableToolFunctionWithoutParse | RunnableToolFunctionWithParse<any> = {
...tool[0],
function: {
...tool[0].function,
function: (args: any) => tool[1](workspaceClient, user, args)
}
}
result.push(res)
}
return result
}
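
Note: the registry above is extensible. A sketch of registering one more tool (the tool itself is hypothetical); getTools() binds it to the workspace client and user automatically:

registerTool<object>(
  {
    type: 'function',
    function: {
      name: 'listTeamspaces',
      parse: JSON.parse,
      parameters: { type: 'object', properties: {} },
      description: 'List the teamspaces available to the current user.'
    }
  },
  async (workspaceClient, user) => {
    const client = await workspaceClient.opClient
    const spaces = await client.findAll(document.class.Teamspace, { archived: false })
    return spaces.map((s) => `Id: ${s._id} Name: ${s.name}`).join('\n')
  }
)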

@@ -22,6 +22,7 @@ import aiBot, {
IdentityResponse
} from '@hcengineering/ai-bot'
import analyticsCollector, { OnboardingChannel } from '@hcengineering/analytics-collector'
import attachment, { Attachment } from '@hcengineering/attachment'
import chunter, {
ChatMessage,
type ChatWidgetTab,
@@ -59,7 +60,7 @@ import { Room } from '@hcengineering/love'
import { countTokens } from '@hcengineering/openai'
import { WorkspaceInfoRecord } from '@hcengineering/server-ai-bot'
import { getOrCreateOnboardingChannel } from '@hcengineering/server-analytics-collector-resources'
import { BlobClient, login } from '@hcengineering/server-client'
import { login } from '@hcengineering/server-client'
import { generateToken } from '@hcengineering/server-token'
import { jsonToMarkup, MarkdownParser, markupToText } from '@hcengineering/text'
import workbench, { SidebarEvent, TxSidebarEvent } from '@hcengineering/workbench'
@@ -67,10 +68,11 @@ import fs from 'fs'
import { WithId } from 'mongodb'
import OpenAI from 'openai'
import { StorageAdapter } from '@hcengineering/server-core'
import config from '../config'
import { AIControl } from '../controller'
import { HistoryRecord } from '../types'
import { createChatCompletion, requestSummary } from '../utils/openai'
import { createChatCompletionWithTools, requestSummary } from '../utils/openai'
import { connectPlatform, getDirect } from '../utils/platform'
import { LoveController } from './love'
@@ -81,8 +83,6 @@ export class WorkspaceClient {
client: Client | undefined
opClient: Promise<TxOperations> | TxOperations
blobClient: BlobClient
loginTimeout: NodeJS.Timeout | undefined
loginDelayMs = 2 * 1000
@@ -103,6 +103,7 @@ export class WorkspaceClient {
love: LoveController | undefined
constructor (
readonly storage: StorageAdapter,
readonly transactorUrl: string,
readonly token: string,
readonly workspace: string,
@@ -110,7 +111,6 @@
readonly ctx: MeasureContext,
readonly info: WorkspaceInfoRecord | undefined
) {
this.blobClient = new BlobClient(transactorUrl, token, { name: this.workspace })
this.opClient = this.initClient()
void this.opClient.then((opClient) => {
this.opClient = opClient
@@ -154,7 +154,14 @@ export class WorkspaceClient {
if (!isAlreadyUploaded) {
const data = fs.readFileSync(config.AvatarPath)
await this.blobClient.upload(this.ctx, config.AvatarName, data.length, config.AvatarContentType, data)
await this.storage.put(
this.ctx,
{ name: this.workspace },
config.AvatarName,
data,
config.AvatarContentType,
data.length
)
await this.controller.updateAvatarInfo(this.workspace, config.AvatarPath, lastModified)
this.ctx.info('Avatar file uploaded successfully', { workspace: this.workspace, path: config.AvatarPath })
}
@@ -209,9 +216,9 @@ export class WorkspaceClient {
return
}
const exist = await this.blobClient.checkFile(this.ctx, config.AvatarName)
const exist = await this.storage.stat(this.ctx, { name: this.workspace }, config.AvatarName)
if (!exist) {
if (exist === undefined) {
this.ctx.error('Cannot find file', { file: config.AvatarName, workspace: this.workspace })
return
}
@@ -449,11 +456,23 @@ export class WorkspaceClient {
this.historyMap.set(objectId, currentHistory)
}
async getAttachments (client: TxOperations, objectId: Ref<Doc>): Promise<Attachment[]> {
return await client.findAll(attachment.class.Attachment, { attachedTo: objectId })
}
async processMessageEvent (event: AIMessageEventRequest): Promise<void> {
if (this.controller.aiClient === undefined) return
const { user, objectId, objectClass, messageClass } = event
const promptText = markupToText(event.message)
const client = await this.opClient
let promptText = markupToText(event.message)
const files = await this.getAttachments(client, event.messageId)
if (files.length > 0) {
promptText += '\n\nAttachments:'
for (const file of files) {
promptText += `\nName:${file.name} FileId:${file.file} Type:${file.type}`
}
}
const prompt: OpenAI.ChatCompletionMessageParam = { content: promptText, role: 'user' }
const promptTokens = countTokens([prompt], this.controller.encoding)
@@ -462,7 +481,6 @@
return
}
const client = await this.opClient
const op = client.apply(undefined, 'AIMessageRequestEvent')
const hierarchy = client.getHierarchy()
@@ -479,16 +497,15 @@
void this.pushHistory(promptText, prompt.role, promptTokens, user, objectId, objectClass)
const chatCompletion = await createChatCompletion(this.controller.aiClient, prompt, user, history)
const response = chatCompletion?.choices[0].message.content
const chatCompletion = await createChatCompletionWithTools(this, this.controller.aiClient, prompt, user, history)
const response = chatCompletion?.completion
if (response == null) {
await this.finishTyping(client, objectId)
return
}
const responseTokens =
chatCompletion?.usage?.completion_tokens ??
countTokens([{ content: response, role: 'assistant' }], this.controller.encoding)
chatCompletion?.usage ?? countTokens([{ content: response, role: 'assistant' }], this.controller.encoding)
void this.pushHistory(response, 'assistant', responseTokens, user, objectId, objectClass)
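
Note: for reference, a prompt carrying a single attached PDF now reaches the model in roughly this shape (file name and id are placeholders); the FileId line is what lets the model hand a fileId to the saveFile tool:

const examplePrompt =
  'Please import this file\n' +
  '\n' +
  'Attachments:\n' +
  'Name:q3-report.pdf FileId:<blob-id> Type:application/pdf'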