platform/server/core/src/indexer/summary.ts

//
// Copyright © 2022 Hardcore Engineering Inc.
//
// Licensed under the Eclipse Public License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. You may
// obtain a copy of the License at https://www.eclipse.org/legal/epl-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//
// See the License for the specific language governing permissions and
// limitations under the License.
//

import core, {
  type AnyAttribute,
  type Class,
  type Doc,
  type DocIndexState,
  type DocumentQuery,
  type DocumentUpdate,
  extractDocKey,
  type Hierarchy,
  type IndexStageState,
  isFullTextAttribute,
  type MeasureContext,
  type Ref,
  type ServerStorage,
  getFullTextContext
} from '@hcengineering/core'
import { translate } from '@hcengineering/platform'
import { jsonToText, markupToJSON } from '@hcengineering/text'
import { type DbAdapter } from '../adapter'
import { type IndexedDoc } from '../types'
import {
  contentStageId,
  type DocUpdateHandler,
  fieldStateId,
  type FullTextPipeline,
  type FullTextPipelineStage
} from './types'
import { collectPropagate, collectPropagateClasses, isCustomAttr, loadIndexStageStage } from './utils'

/**
 * Identifier of the full-text summary stage; used as the `stageId` of `FullSummaryStage`.
 *
 * NOTE(review): the `-v5` suffix looks like a version marker — presumably it is bumped when the
 * summary format changes; confirm against how the pipeline compares stage ids.
 *
 * @public
 */
export const summaryStageId = 'sum-v5'

/**
 * Full-text pipeline stage that builds the `fullSummary` text of every indexed document.
 *
 * The summary of a document is made of its own indexed attribute values, the values of its
 * attached child documents (unless their full-text context disables `parentPropagate`) and,
 * when the parent class propagates into this document, the values of the parent and of the
 * parent's propagated collections.
 *
 * @public
 */
export class FullSummaryStage implements FullTextPipelineStage {
  // Stages listed here are declared as requirements of this one.
  require = [fieldStateId, contentStageId]
  stageId = summaryStageId

  enabled = true

  clearExcept?: string[] = undefined

  updateFields: DocUpdateHandler[] = []

  // If specified, index only fields with content specified.
  matchExtra: string[] = [] // 'content', 'base64'] // '#en'

  // Every predicate must accept an attribute value for it to be included into the summary.
  fieldFilter: ((attr: AnyAttribute, value: string) => boolean)[] = []

  // Value passed to `pipeline.update` to mark a document as processed by this stage.
  stageValue: boolean | string = true

  indexState?: IndexStageState

  // Soft cap for a summary: once the text grows beyond 1 MB no more related documents are appended.
  // The check happens before each append, so the final text may exceed the cap by one document.
  summaryLimit = 1024 * 1024

  constructor (private readonly dbStorage: ServerStorage) {}

  /**
   * Compute the stage configuration and load the stage state for it.
   *
   * The configuration consists of one sorted string per class whose `FullTextSearchContext`
   * mixin has `fullTextSummary` set (class id followed by its `propagateClasses` joined with `|`)
   * plus the current `matchExtra`. The result of `loadIndexStageStage` is stored into
   * `stageValue` and `indexState`.
   */
  async initialize (ctx: MeasureContext, storage: DbAdapter, pipeline: FullTextPipeline): Promise<void> {
    const indexable = (
      await pipeline.model.findAll(core.class.Class, { [core.mixin.FullTextSearchContext]: { $exists: true } })
    )
      .map((it) => pipeline.hierarchy.as(it, core.mixin.FullTextSearchContext))
      .filter((it) => it.fullTextSummary)
      .map((it) => it._id + (it.propagateClasses ?? []).join('|'))
    indexable.sort()
    ;[this.stageValue, this.indexState] = await loadIndexStageStage(
      ctx,
      storage,
      this.indexState,
      this.stageId,
      'config',
      {
        classes: indexable,
        matchExtra: this.matchExtra
      }
    )
  }

  /**
   * This stage does not take part in searching: it always returns no documents and `pass: true`.
   */
  async search (
    _classes: Ref<Class<Doc>>[],
    search: DocumentQuery<Doc>,
    size?: number,
    from?: number
  ): Promise<{ docs: IndexedDoc[], pass: boolean }> {
    return { docs: [], pass: true }
  }

  /**
   * Build and store the `fullSummary` for every document in `toIndex`.
   *
   * Documents are processed in batches of 1000. Documents whose class does not require a summary
   * are only marked as processed. Processing stops as soon as `pipeline.cancelling` is set.
   *
   * @param toIndex - index states to process; the array itself is not modified
   * @param pipeline - pipeline used for hierarchy lookups and for storing the result
   * @param metrics - measure context used for the storage queries
   */
  async collect (toIndex: DocIndexState[], pipeline: FullTextPipeline, metrics: MeasureContext): Promise<void> {
    const part = [...toIndex]
    while (part.length > 0) {
      const toIndexPart = part.splice(0, 1000)

      const kids = toIndexPart.map((it) => it._id)
      const allChildDocs = await metrics.with(
        'find-child',
        {},
        async (ctx) =>
          await this.dbStorage.findAll(ctx, core.class.DocIndexState, {
            attachedTo: kids.length === 1 ? kids[0] : { $in: kids }
          })
      )

      // Group the children by parent once instead of re-scanning the whole result for every document.
      const childrenByParent = new Map<string, DocIndexState[]>()
      for (const child of allChildDocs) {
        if (child.attachedTo == null) {
          continue
        }
        const siblings = childrenByParent.get(child.attachedTo)
        if (siblings === undefined) {
          childrenByParent.set(child.attachedTo, [child])
        } else {
          siblings.push(child)
        }
      }

      for (const doc of toIndexPart) {
        if (pipeline.cancelling) {
          return
        }

        // No need to index this class, just mark the document as processed by this stage.
        if (!isIndexingRequired(pipeline, doc)) {
          await pipeline.update(doc._id, this.stageValue, {})
          continue
        }

        let embeddingText = await this.extractText(doc, pipeline.hierarchy)

        // Include all child attributes
        for (const c of childrenByParent.get(doc._id) ?? []) {
          const ctx = getFullTextContext(pipeline.hierarchy, c.objectClass)
          if (ctx.parentPropagate ?? true) {
            if (embeddingText.length > this.summaryLimit) {
              break
            }
            embeddingText += '\n' + (await this.extractText(c, pipeline.hierarchy))
          }
        }

        if (doc.attachedToClass != null && doc.attachedTo != null) {
          const propagate: Ref<Class<Doc>>[] = collectPropagate(pipeline, doc.attachedToClass)
          if (propagate.some((it) => pipeline.hierarchy.isDerived(doc.objectClass, it))) {
            // We need to include all parent content into this one.
            const [parentDoc] = await this.dbStorage.findAll(
              metrics.newChild('propagate', {}),
              core.class.DocIndexState,
              { _id: doc.attachedTo as Ref<DocIndexState> },
              { limit: 1 }
            )
            if (parentDoc !== undefined) {
              const ctx = collectPropagateClasses(pipeline, parentDoc.objectClass)
              if (ctx.length > 0) {
                const collections = await this.dbStorage.findAll(
                  metrics.newChild('propagate', {}),
                  core.class.DocIndexState,
                  { attachedTo: parentDoc._id, objectClass: ctx.length === 1 ? ctx[0] : { $in: ctx } }
                )
                for (const c of collections) {
                  // Same cap as for the child documents above.
                  if (embeddingText.length > this.summaryLimit) {
                    break
                  }
                  embeddingText += '\n' + (await this.extractText(c, pipeline.hierarchy))
                }
              }

              // Only the parent text is skipped when the cap is reached. A `break` here would leave the
              // loop over documents and drop the update of this and all remaining documents of the batch.
              if (embeddingText.length <= this.summaryLimit) {
                embeddingText += '\n' + (await this.extractText(parentDoc, pipeline.hierarchy))
              }
            }
          }
        }

        const update: DocumentUpdate<DocIndexState> = { fullSummary: embeddingText }

        await pipeline.update(doc._id, this.stageValue, update)
      }
    }
  }

  /**
   * Mark removed documents as processed by this stage; nothing else is updated here.
   */
  async remove (docs: DocIndexState[], pipeline: FullTextPipeline): Promise<void> {
    // will be handled by field processor
    for (const doc of docs) {
      await pipeline.update(doc._id, this.stageValue, {})
    }
  }

  // Extract the summary text of one index state using the stage's current match/filter settings.
  private async extractText (doc: DocIndexState, hierarchy: Hierarchy): Promise<string> {
    return await extractIndexedValues(doc, hierarchy, {
      matchExtra: this.matchExtra,
      fieldFilter: this.fieldFilter
    })
  }
}

/**
 * Tell whether a summary has to be built for the given index state.
 *
 * Looks up the full-text context of `doc.objectClass` in the pipeline hierarchy and returns its
 * `fullTextSummary` flag; a missing flag is treated as `false`.
 *
 * @public
 */
export function isIndexingRequired (pipeline: FullTextPipeline, doc: DocIndexState): boolean {
  const { fullTextSummary } = getFullTextContext(pipeline.hierarchy, doc.objectClass)
  return fullTextSummary ?? false
}

/**
 * Build the plain-text summary of a single index state from its stored attributes.
 *
 * Every entry of `doc.attributes` is inspected; its key is parsed with `extractDocKey`. An entry
 * contributes to the result only when all of the following hold:
 * - the value is not null and, after stringifying, trimming and (for `base64` entries) decoding, not empty;
 * - the key carries no `docId` (such entries are skipped);
 * - the key names a class, and its extras contain every item of `opt.matchExtra`;
 * - the attribute exists in `hierarchy`, is a full-text attribute and passes every `opt.fieldFilter`;
 * - it is not an attachment attribute without extras;
 * - its joined extras are not shorter than those of an already accepted entry of the same attribute.
 *
 * Accepted values are emitted as `<translated label> is <value>`, grouped by class. Custom attributes
 * (see `isCustomAttr`) are additionally emitted as `<label> is <value>` pairs under `doc.objectClass`.
 * Errors raised while handling one entry are logged and the entry is skipped.
 *
 * @param doc - index state whose attributes are summarized
 * @param hierarchy - hierarchy used to resolve attribute definitions
 * @param opt - `matchExtra`: extras each key must contain; `fieldFilter`: predicates each value must pass
 * @returns the summary with runs of spaces/tabs collapsed to one space, runs of blank lines collapsed
 * to one blank line, and surrounding whitespace trimmed
 *
 * @public
 */
export async function extractIndexedValues (
  doc: DocIndexState,
  hierarchy: Hierarchy,
  opt: {
    matchExtra: string[]
    fieldFilter: ((attr: AnyAttribute, value: string) => boolean)[]
  }
): Promise<string> {
  const attributes: Record<Ref<Class<Doc>>, Record<string, string>> = {}
  // Joined extras of the last accepted entry, per attribute name.
  const currentReplacement: Record<string, string> = {}

  for (const [k, v] of Object.entries(doc.attributes)) {
    if (v == null) {
      continue
    }
    try {
      const { _class, attr, extra, docId } = extractDocKey(k)
      if (docId !== undefined) {
        // Entries bound to a specific document id never take part in the summary.
        continue
      }

      let sourceContent = `${v as string}`.trim()
      if (extra.includes('base64')) {
        sourceContent = Buffer.from(sourceContent, 'base64').toString().trim()
      }
      if (sourceContent.length === 0) {
        continue
      }

      if (isCustomAttr(attr)) {
        // Custom attributes hold label/value pairs; a value of another shape throws and is logged below.
        const str = v
          .map((pair: { label: string, value: string }) => {
            return `${pair.label} is ${pair.value}`
          })
          .join(' ')
        const cl = doc.objectClass
        attributes[cl] = { ...attributes[cl], [k]: str }
      }

      if (_class === undefined) {
        // Skip all helper fields.
        continue
      }

      if (!opt.matchExtra.every((it) => extra.includes(it))) {
        continue
      }
      // Check if attribute is indexable
      const keyAttr: AnyAttribute | undefined = hierarchy.findAttribute(_class, attr)
      if (keyAttr === undefined) {
        // Skip if there is no attribute.
        continue
      }

      if (keyAttr.type._class === core.class.TypeMarkup || keyAttr.type._class === core.class.TypeCollaborativeMarkup) {
        // Markup is reduced to its plain text before filtering and output.
        sourceContent = jsonToText(markupToJSON(sourceContent))
      }

      if (!opt.fieldFilter.every((it) => it(keyAttr, sourceContent))) {
        // Some of filters not pass value
        continue
      }

      if (!isFullTextAttribute(keyAttr)) {
        continue
      }
      if (keyAttr.type._class === core.class.TypeAttachment && extra.length === 0) {
        // Skip attachment id values.
        continue
      }

      const repl = extra.join('#')

      if ((currentReplacement[attr] ?? '').length <= repl.length) {
        const label = await translate(keyAttr.label, {})
        // `_class` is defined and `docId` is undefined at this point, so the value always goes
        // into the per-class bucket.
        attributes[_class] = { ...attributes[_class], [k]: `${label} is ${sourceContent}\n` }
        currentReplacement[attr] = repl
      }
    } catch (err: unknown) {
      console.log(err)
    }
  }
  let embeddingText = ''

  // One group per class, each value on its own line.
  for (const vv of Object.values(attributes)) {
    embeddingText += '\n'
    for (const v of Object.values(vv)) {
      embeddingText += ' ' + v + '\n'
    }
  }

  // Trim empty inner space.
  embeddingText = embeddingText
    .split(/  +|\t+/)
    .filter((it) => it !== '')
    .join(' ')
  embeddingText = embeddingText
    .split(/\n\n+/)
    .filter((it) => it !== '')
    .join('\n\n')
  return embeddingText.trim()
}