UBERF-11156: Decode encoded mail content and subject (#9157)

Signed-off-by: Artem Savchenko <armisav@gmail.com>
This commit is contained in:
Artyom Savchenko 2025-06-03 16:19:33 +07:00 committed by GitHub
parent 9c04340976
commit 67fc7dd0c4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 491 additions and 31 deletions

View File

@ -0,0 +1,81 @@
{
"envelope": {
"from": {
"address": "example1@test.com"
},
"to": [
{
"address": "recipient2@example.com"
}
]
},
"message": {
"contents": "VGVzdCBlbmNvZGVkIGVtYWlsIGNvbnRlbnQ=",
"headers": [
[
"Received",
" from mail-nwsmtp-mxback-production-main-38.iva.yp-c.yandex.net (mail-nwsmtp-mxback-production-main-38.iva.yp-c.yandex.net [IPv6:2a02:6b8:c0c:1724:0:640:dee6:0])\r\n\tby forward500b.mail.yandex.net (Yandex) with ESMTPS id 65A23611AA\r\n\tfor <artyom@huly.app>; Tue, 3 Jun 2025 06:35:44 +0300 (MSK)\r\n"
],
[
"Received",
" from mail.yandex.ru (2a02:6b8:c0c:b187:0:640:6b88:0 [2a02:6b8:c0c:b187:0:640:6b88:0])\r\n\tby mail-nwsmtp-mxback-production-main-38.iva.yp-c.yandex.net (mxback/Yandex) with HTTPS id YZRhuVDL1eA0-m7PJdtq8;\r\n\tTue, 03 Jun 2025 06:35:44 +0300\r\n"
],
[
"X-Yandex-Fwd",
" 1\r\n"
],
[
"DKIM-Signature",
" v=1; a=rsa-sha256; c=relaxed/relaxed; d=yandex.ru; s=mail;\r\n\tt=1748921744; bh=DnseDjFgmtsB1kN2sgMKhzeGZ1TcOQm0aEN3ux6v8k0=;\r\n\th=Message-Id:Date:Subject:In-Reply-To:To:From;\r\n\tb=mOwsmrIzUHfrnHY6fjABtgU2IHkXKyHjoEmbKNGHPkFFdq9fqtNiw7rwX7HYJIFwN\r\n\t Jx9ZGkGNLpDGElAXs67xexAp6t/mAebaInU/5/C7nJd8YMlkauUGTKmQDD4rtOrBSG\r\n\t 2LAfXrsAyEVaeqnIjhNEir+sAWHyA1+kDPpA4jCc=\r\n"
],
[
"Authentication-Results",
" mail-nwsmtp-mxback-production-main-38.iva.yp-c.yandex.net; dkim=pass header.i=@yandex.ru\r\n"
],
[
"Received",
" by qvxj4z7i6zm4ub2j.iva.yp-c.yandex.net with HTTP;\r\n\tTue, 03 Jun 2025 06:35:43 +0300\r\n"
],
[
"From",
" =?utf-8?B?RXhhbXBsZSBVc2VyMQ==?= <example1@test.com>\r\n"
],
[
"To",
" \"Example Recipient2\" <recipient2@example.com>\r\n"
],
[
"In-Reply-To",
" YUvuZoQ3ypgAAAAAAfVzrQAAAAAAAAQu\r\n"
],
[
"Subject",
" =?utf-8?B?VGhpcyBpcyBlbmNvZGVkIGVtYWlsIHN1YmplY3Q=?=\r\n"
],
[
"MIME-Version",
" 1.0\r\n"
],
[
"X-Mailer",
" Yamail [ http://yandex.ru ] 5.0\r\n"
],
[
"Date",
" Tue, 03 Jun 2025 06:35:43 +0300\r\n"
],
[
"Message-Id",
" <1296141748921736@mail.yandex.ru>\r\n"
],
[
"Content-Transfer-Encoding",
" base64\r\n"
],
[
"Content-Type",
" text/html; charset=utf-8\r\n"
]
]
}
}

View File

@ -0,0 +1,230 @@
//
// Copyright © 2025 Hardcore Engineering Inc.
//
// Licensed under the Eclipse Public License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. You may
// obtain a copy of the License at https://www.eclipse.org/legal/epl-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//
// See the License for the specific language governing permissions and
// limitations under the License.
//
import { decodeContent, decodeEncodedWords } from '../decode'
import { MeasureContext } from '@hcengineering/core'
// Replace the '../config' module with a fixed in-memory configuration so the
// tests below do not depend on real service settings.
// NOTE(review): presumably `virtual: true` is passed so the mock is registered
// even if '../config' cannot be resolved from this test's location — confirm.
jest.mock(
'../config',
() => ({
hookToken: 'test-hook-token',
ignoredAddresses: ['ignored@example.com'],
storageConfig: 'test-storage-config',
workspaceUrl: 'test-workspace'
}),
{ virtual: true }
)
// Minimal stand-in for MeasureContext: only `info`, `error` and `warn` are
// provided (as jest mock functions); the `as any` cast bypasses the remaining
// members of the MeasureContext type.
const mockCtx: MeasureContext = {
info: jest.fn(),
error: jest.fn(),
warn: jest.fn()
} as any
// decodeContent: one row per scenario, all sharing the same call-and-compare body.
describe('decodeContent', () => {
  // [test title, raw content, Content-Transfer-Encoding value, expected result]
  const scenarios: Array<[string, string, string | undefined, string]> = [
    ['should return original content when encoding is undefined', 'Hello World', undefined, 'Hello World'],
    ['should return original content when encoding is empty string', 'Hello World', '', 'Hello World'],
    // 'SGVsbG8gV29ybGQ=' is "Hello World" in base64
    ['should decode base64 content', 'SGVsbG8gV29ybGQ=', 'base64', 'Hello World'],
    ['should decode base64 content with case insensitive encoding', 'SGVsbG8gV29ybGQ=', 'BASE64', 'Hello World'],
    ['should decode quoted-printable content', 'Hello=20World=21', 'quoted-printable', 'Hello World!'],
    [
      'should handle quoted-printable with soft line breaks',
      'This is a very long line that needs=\r\nto be wrapped',
      'quoted-printable',
      'This is a very long line that needsto be wrapped'
    ],
    ['should return original content for 7bit encoding', 'Plain text content', '7bit', 'Plain text content'],
    [
      'should return original content for 8bit encoding',
      'Plain text with émojis 🎉',
      '8bit',
      'Plain text with émojis 🎉'
    ],
    ['should return original content for binary encoding', 'Binary content', 'binary', 'Binary content'],
    [
      'should return original content for unknown encoding',
      'Unknown encoding content',
      'unknown-encoding',
      'Unknown encoding content'
    ]
  ]

  test.each(scenarios)('%s', (_title, content, encoding, expected) => {
    expect(decodeContent(mockCtx, content, encoding)).toBe(expected)
  })
})
// decodeEncodedWords: header values containing RFC 2047 encoded words.
describe('decodeEncodedWords', () => {
  // [test title, header text, expected decoded text]
  const scenarios: Array<[string, string, string]> = [
    [
      'should return original text when no encoded words present',
      'Plain text without encoding',
      'Plain text without encoding'
    ],
    ['should decode base64 encoded word', '=?utf-8?B?SGVsbG8gV29ybGQ=?=', 'Hello World'],
    ['should decode quoted-printable encoded word', '=?utf-8?Q?Hello=20World?=', 'Hello World'],
    ['should decode quoted-printable with underscores as spaces', '=?utf-8?Q?Hello_World?=', 'Hello World'],
    [
      'should handle multiple encoded words in same text',
      '=?utf-8?B?SGVsbG8=?= =?utf-8?B?V29ybGQ=?=',
      'Hello World'
    ],
    [
      'should handle mixed encoded and plain text',
      'Subject: =?utf-8?B?SGVsbG8=?= from sender',
      'Subject: Hello from sender'
    ],
    ['should handle case insensitive encoding (lowercase b)', '=?utf-8?b?SGVsbG8gV29ybGQ=?=', 'Hello World'],
    ['should handle case insensitive encoding (lowercase q)', '=?utf-8?q?Hello_World?=', 'Hello World'],
    // An unrecognised encoding letter leaves the text untouched
    ['should handle unknown encoding gracefully', '=?utf-8?X?unknown?=', '=?utf-8?X?unknown?='],
    [
      'should decode real-world email subject',
      '=?UTF-8?B?8J+OiSBXZWxjb21lIHRvIG91ciBwbGF0Zm9ybSE=?=',
      '🎉 Welcome to our platform!'
    ],
    ['should handle empty encoded text', '=?utf-8?B??=', ''],
    ['should handle different charset - ISO-8859-1', '=?iso-8859-1?B?SGVsbG8gV29ybGQ=?=', 'Hello World'],
    ['should handle different charset - latin1', '=?latin1?B?SGVsbG8gV29ybGQ=?=', 'Hello World'],
    ['should handle different charset - windows-1252', '=?windows-1252?B?SGVsbG8gV29ybGQ=?=', 'Hello World'],
    ['should handle ASCII charset', '=?us-ascii?B?SGVsbG8gV29ybGQ=?=', 'Hello World'],
    ['should handle case insensitive charset names', '=?UTF-8?B?SGVsbG8gV29ybGQ=?=', 'Hello World'],
    ['should handle charset with whitespace', '=? utf-8 ?B?SGVsbG8gV29ybGQ=?=', 'Hello World'],
    // The payload is plain ASCII, so it still reads correctly when decoded as utf8
    ['should default to utf8 for unsupported charset', '=?gb2312?B?SGVsbG8gV29ybGQ=?=', 'Hello World'],
    [
      'should handle mixed charsets in same text',
      '=?utf-8?B?SGVsbG8=?= =?iso-8859-1?B?V29ybGQ=?=',
      'Hello World'
    ],
    ['should handle quoted-printable with different charset', '=?iso-8859-1?Q?caf=E9?=', 'café']
  ]

  test.each(scenarios)('%s', (_title, text, expected) => {
    expect(decodeEncodedWords(mockCtx, text)).toBe(expected)
  })

  test('should handle error in charset conversion gracefully', () => {
    const warnSpy = jest.spyOn(mockCtx, 'warn')
    // Unknown charset with a payload that is not clean base64: the only
    // requirement is that a string comes back (decoded via fallback or unchanged).
    const decoded = decodeEncodedWords(mockCtx, '=?invalid-charset?B?invalid-content?=')
    expect(typeof decoded).toBe('string')
    warnSpy.mockRestore()
  })
})

View File

@ -13,6 +13,8 @@
// limitations under the License.
//
import fs from 'fs/promises'
import path from 'path'
import { Request, Response } from 'express'
import { MeasureContext } from '@hcengineering/core'
import { createMessages } from '@hcengineering/mail-common'
@ -486,4 +488,41 @@ This is an **HTML** test email`
[]
)
})
it('should decode encoded content in email', async () => {
// Load the MTA message fixture from __mocks__/base64Message.json and use it as the hook request body
const base64MessageData = await fs.readFile(path.join(__dirname, '__mocks__/base64Message.json'), 'utf-8')
const mtaMessage: MtaMessage = JSON.parse(base64MessageData)
mockReq = {
headers: { 'x-hook-token': 'test-hook-token' },
body: mtaMessage
}
await handleMtaHook(mockReq as Request, mockRes as Response, mockCtx)
// Should return 200 and accept the message
expect(mockStatus).toHaveBeenCalledWith(200)
expect(mockSend).toHaveBeenCalledWith({ action: 'accept' })
// The message handed to createMessages must carry the decoded sender name,
// subject and content rather than their encoded forms
expect(createMessages).toHaveBeenCalledWith(
client.baseConfig,
mockCtx,
mockTxOperations,
{},
{},
client.mailServiceToken,
mockLoginInfo,
expect.objectContaining({
mailId: expect.any(String),
from: { email: 'example1@test.com', firstName: 'Example', lastName: 'User1' },
to: [{ email: 'recipient2@example.com', firstName: 'Example', lastName: 'Recipient2' }],
subject: 'This is encoded email subject',
content: 'Test encoded email content',
incoming: true
}),
[]
)
})
})

View File

@ -0,0 +1,130 @@
//
// Copyright © 2025 Hardcore Engineering Inc.
//
// Licensed under the Eclipse Public License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. You may
// obtain a copy of the License at https://www.eclipse.org/legal/epl-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//
// See the License for the specific language governing permissions and
// limitations under the License.
//
import { MeasureContext } from '@hcengineering/core'
import { MtaMessage } from './types'
import { getHeader } from './utils'
/**
 * Returns the body of an MTA message with its transfer encoding undone.
 *
 * Reads the `Content-Transfer-Encoding` header via `getHeader` and passes it,
 * together with the raw `mta.message.contents`, to `decodeContent`.
 *
 * @param ctx - context forwarded to `decodeContent`
 * @param mta - message whose contents should be decoded
 * @returns whatever `decodeContent` produces for the message contents
 */
export function getDecodedContent (ctx: MeasureContext, mta: MtaMessage): string {
  const transferEncoding = getHeader(mta, 'Content-Transfer-Encoding')
  const rawBody = mta.message.contents
  return decodeContent(ctx, rawBody, transferEncoding)
}
/**
 * Decodes `content` according to a Content-Transfer-Encoding value.
 *
 * The encoding name is compared case-insensitively with surrounding
 * whitespace ignored:
 * - `base64`: decoded and interpreted as UTF-8 text;
 * - `quoted-printable`: handed to `decodeQuotedPrintable`;
 * - anything else (including `7bit`, `8bit`, `binary`, a missing or blank
 *   value, and unrecognised names): returned unchanged.
 *
 * @param ctx - context used to log a warning if base64 decoding throws
 * @param content - raw message content
 * @param encoding - value of the Content-Transfer-Encoding header, if any
 * @returns the decoded text, or `content` itself when no decoding applies
 */
export function decodeContent (ctx: MeasureContext, content: string, encoding: string | undefined): string {
  const transferEncoding = encoding?.trim().toLowerCase() ?? ''

  if (transferEncoding === 'base64') {
    try {
      const bytes = Buffer.from(content, 'base64')
      return bytes.toString('utf-8')
    } catch (error: any) {
      ctx.warn('Failed to decode base64 content:', { error: error.message })
      return content
    }
  }

  if (transferEncoding === 'quoted-printable') {
    return decodeQuotedPrintable(content)
  }

  // '7bit', '8bit', 'binary', blank and unknown encodings need no decoding
  return content
}
/**
 * Decodes a quoted-printable body (RFC 2045, section 6.7).
 *
 * Steps, in this order:
 * 1. Soft line breaks (`=` immediately followed by a line break) and a stray
 *    trailing `=` are removed. This happens before hex decoding so that a
 *    literal `=` produced from `=3D` at the end of a line is not mistaken
 *    for a soft break and dropped together with the real line break.
 * 2. Each run of consecutive `=XX` escapes is collected into bytes and
 *    decoded as a unit, so multi-byte UTF-8 sequences such as `=C3=A9`
 *    become a single character instead of one character per byte.
 *
 * A run that is not valid UTF-8 is decoded byte-per-character (Latin-1),
 * which is what the previous per-escape decoding produced for every run.
 *
 * @param content - quoted-printable encoded text
 * @returns the decoded text
 */
function decodeQuotedPrintable (content: string): string {
  return content
    .replace(/=\r?\n/g, '') // Remove soft line breaks
    .replace(/=$/gm, '') // Remove trailing = at end of lines
    .replace(/(?:=[0-9A-F]{2})+/gi, (run: string) => {
      const bytes = Buffer.from(run.replace(/=/g, ''), 'hex')
      const asUtf8 = bytes.toString('utf8')
      // Valid UTF-8 survives a decode/encode round trip byte-for-byte; invalid
      // input is replaced with U+FFFD during decoding and therefore does not.
      const isValidUtf8 = Buffer.from(asUtf8, 'utf8').equals(bytes)
      return isValidUtf8 ? asUtf8 : bytes.toString('latin1')
    })
}
/**
 * Decodes RFC 2047 encoded words (`=?charset?encoding?encoded_text?=`) inside
 * a header value. Text outside encoded words is left untouched.
 *
 * Both `B` (base64) and `Q` (quoted-printable variant) encodings are handled,
 * in either letter case. Anything that does not match the encoded-word pattern,
 * including words with an unknown encoding letter, is returned as is.
 *
 * @param ctx - context used to log a warning when a word cannot be decoded
 * @param text - raw header value
 * @returns `text` with every decodable encoded word replaced by its decoded form
 */
export function decodeEncodedWords (ctx: MeasureContext, text: string): string {
  // RFC 2047 encoded word pattern: =?charset?encoding?encoded_text?=
  const encodedWordPattern = /=\?([^?]+)\?([BQbq])\?([^?]*)\?=/g
  return text.replace(
    encodedWordPattern,
    (match: string, charset: string, encoding: string, encodedText: string) => {
      try {
        // The pattern only admits B/b and Q/q, so "not B" means Q here
        const bytes =
          encoding.toLowerCase() === 'b' ? Buffer.from(encodedText, 'base64') : decodeQEncodedBytes(encodedText)
        return decodeBytes(bytes, charset)
      } catch (error: any) {
        ctx.warn('Failed to decode encoded word:', { match, error: error.message })
        return match // Return original if decoding fails
      }
    }
  )
}

// Charset labels that are decoded with Buffer directly (via normalizeCharset)
const BUFFER_DECODED_CHARSETS = new Set([
  'utf-8',
  'utf8',
  'iso-8859-1',
  'latin1',
  'ascii',
  'us-ascii',
  'utf-16',
  'utf-16le',
  'ucs-2',
  'ucs2',
  'base64',
  'hex'
])

/**
 * Turns the payload of a Q-encoded word into raw bytes: `_` stands for a
 * space and `=XX` for the byte with hexadecimal value XX.
 */
function decodeQEncodedBytes (encodedText: string): Buffer {
  const unescaped = encodedText
    .replace(/_/g, ' ') // Underscores represent spaces in encoded words
    .replace(/=([0-9A-F]{2})/gi, (_match: string, hex: string) => String.fromCharCode(parseInt(hex, 16)))
  // 'binary' maps every character to a single byte
  return Buffer.from(unescaped, 'binary')
}

/**
 * Converts decoded bytes to a string using the charset named in the encoded word.
 *
 * Charsets listed in BUFFER_DECODED_CHARSETS go through Buffer. Every other
 * label (koi8-r, windows-1251, windows-1252, gb2312, ...) is first tried with
 * TextDecoder; if the runtime does not know the label, decoding falls back
 * to the Buffer encoding chosen by normalizeCharset.
 */
function decodeBytes (bytes: Buffer, charset: string): string {
  // RFC 2231 allows a language suffix ("utf-8*en"); it has no effect on decoding
  const label = charset.split('*')[0].toLowerCase().trim()
  if (!BUFFER_DECODED_CHARSETS.has(label)) {
    try {
      return new TextDecoder(label).decode(bytes)
    } catch {
      // Unknown label or encoding data not available in this runtime:
      // deliberately fall through to the Buffer-based default below.
    }
  }
  return bytes.toString(normalizeCharset(label))
}

/**
 * Maps a charset name to the Node.js Buffer encoding used to decode it.
 * Matching is case-insensitive and ignores surrounding whitespace; names
 * without a mapping resolve to 'utf8'.
 */
function normalizeCharset (charset: string): BufferEncoding {
  const normalized = charset.toLowerCase().trim()
  // Map common charset aliases to Node.js Buffer encodings
  switch (normalized) {
    case 'utf-8':
    case 'utf8':
      return 'utf8'
    case 'iso-8859-1':
    case 'latin1':
    case 'cp1252':
    case 'windows-1252':
      return 'latin1'
    case 'ascii':
    case 'us-ascii':
      return 'ascii'
    case 'utf-16':
    case 'utf-16le':
    case 'ucs-2':
    case 'ucs2':
      return 'utf16le'
    // NOTE(review): 'base64' and 'hex' are Buffer encodings, not character sets;
    // kept so existing inputs keep producing the same output.
    case 'base64':
      return 'base64'
    case 'hex':
      return 'hex'
    // For any unsupported charset, default to utf8
    default:
      return 'utf8'
  }
}

View File

@ -23,6 +23,7 @@ import { mailServiceToken, baseConfig, kvsClient } from './client'
import config from './config'
import { MtaMessage } from './types'
import { getHeader, parseContent } from './utils'
import { decodeEncodedWords } from './decode'
export async function handleMtaHook (req: Request, res: Response, ctx: MeasureContext): Promise<void> {
try {
@ -60,7 +61,7 @@ export async function handleMtaHook (req: Request, res: Response, ctx: MeasureCo
}
}
const subject = getHeader(mta, 'Subject') ?? ''
const subject = decodeEncodedWords(ctx, getHeader(mta, 'Subject') ?? '')
const inReplyTo = getHeader(mta, 'In-Reply-To')
const { content, attachments } = await parseContent(ctx, mta)
@ -132,7 +133,7 @@ function extractContactName (
// Match name part that appears before an email in angle brackets
const nameMatch = fromHeader.match(/^\s*"?([^"<]+?)"?\s*<.+?>/)
const encodedName = nameMatch?.[1].trim() ?? ''
const name = encodedName.length > 0 ? decodeMimeWord(ctx, encodedName) : ''
const name = encodedName.length > 0 ? decodeEncodedWords(ctx, encodedName) : ''
let [firstName, lastName] = name.split(' ')
if (firstName === undefined || firstName.length === 0) {
firstName = email.split('@')[0]
@ -143,28 +144,6 @@ function extractContactName (
return { firstName, lastName }
}
// Replaces every `=?charset?B|Q?content?=` sequence in `text` with its decoded form.
// Sequences with any other encoding letter, or whose decoding throws, are left as they are
// (a warning is logged through `ctx` in the latter case).
function decodeMimeWord (ctx: MeasureContext, text: string): string {
// The `i` flag lets the encoding letter match in either case; `content` must be non-empty ([^?]+)
return text.replace(/=\?([^?]+)\?([BQ])\?([^?]+)\?=/gi, (match, charset, encoding, content) => {
try {
if (encoding.toUpperCase() === 'B') {
// Base64 encoding
const buffer = Buffer.from(content, 'base64')
// The charset string from the header is passed to Buffer as-is
return buffer.toString(charset as BufferEncoding)
} else if (encoding.toUpperCase() === 'Q') {
// Quoted-printable encoding
const decoded = content
.replace(/_/g, ' ')
.replace(/=([0-9A-F]{2})/gi, (_: any, hex: string) => String.fromCharCode(parseInt(hex, 16)))
// NOTE(review): Buffer.from is called without an encoding argument here, so characters
// above 0x7F are presumably re-encoded as UTF-8 rather than kept as single bytes — confirm
return Buffer.from(decoded).toString(charset as BufferEncoding)
}
return match
} catch (error) {
ctx.warn('Failed to decode encoded word', { error })
return match
}
})
}
function stripTags (email: string): string {
const [name, domain] = email.split('@')
const tagStart = name.indexOf('+')

View File

@ -21,6 +21,7 @@ import { type Attachment } from '@hcengineering/mail-common'
import { MtaMessage } from './types'
import config from './config'
import { getDecodedContent } from './decode'
export async function parseContent (
ctx: MeasureContext,
@ -34,10 +35,10 @@ export async function parseContent (
}
if (contentType.toLowerCase().startsWith('text/plain')) {
return { content: mta.message.contents, attachments: [] }
return { content: getDecodedContent(ctx, mta), attachments: [] }
}
const email = await getEmailContent(mta)
const email = await getEmailContent(ctx, mta)
let content = email.text ?? ''
let isMarkdown = false
@ -83,14 +84,14 @@ export async function parseContent (
return { content, attachments }
}
export function convertMtaToEml (mta: MtaMessage): string {
export function convertMtaToEml (ctx: MeasureContext, mta: MtaMessage): string {
return `MIME-Version: 1.0
Date: ${new Date().toUTCString()}
From: ${mta.envelope.from.address}
To: ${mta.envelope.to.map((to) => to.address).join(', ')}
Content-Type: ${getHeader(mta, 'Content-Type') ?? 'text/plain; charset=utf-8'}
${unescapeString(mta.message.contents)}`
${unescapeString(getDecodedContent(ctx, mta))}`
}
function unescapeString (str: string): string {
@ -107,8 +108,8 @@ export function getHeader (mta: MtaMessage, header: string): string | undefined
return mta.message.headers.find((header) => header[0].toLowerCase() === h)?.[1]?.trim()
}
async function getEmailContent (mta: MtaMessage): Promise<ReadedEmlJson> {
const eml = convertMtaToEml(mta)
async function getEmailContent (ctx: MeasureContext, mta: MtaMessage): Promise<ReadedEmlJson> {
const eml = convertMtaToEml(ctx, mta)
const email = await new Promise<ReadedEmlJson>((resolve, reject) => {
readEml(eml, (err, json) => {
if (err !== undefined && err !== null) {
@ -123,7 +124,7 @@ async function getEmailContent (mta: MtaMessage): Promise<ReadedEmlJson> {
if (isEmptyString(email.text) && isEmptyString(email.html)) {
return {
...email,
text: removeContentTypeHeader(mta.message.contents)
text: removeContentTypeHeader(getDecodedContent(ctx, mta))
}
}
return email