feat: support voice & image data as input to event

Raj Sharma 2024-10-08 22:19:46 +05:30
parent 97add456a7
commit b2ad77e5b5
7 changed files with 192 additions and 71 deletions

View File

@@ -277,7 +277,9 @@ export class DiscordAdapter implements PlatformAdapter {
       isDirectMessage: async () =>
         discordMessage.channel.type === ChannelType.DM,
       send: async (messageData) => {
-        const sentMessage = await discordMessage.channel.send(messageData);
+        const sentMessage = await (discordMessage.channel as TextChannel).send(
+          messageData
+        );
         return this.convertSentMessage(sentMessage);
       },
       reply: async (messageData) => {
@@ -309,12 +311,12 @@
         return Promise.all(messages.map((msg) => this.convertMessage(msg)));
       },
       sendFile: async (fileUrl, fileName) => {
-        await discordMessage.channel.send({
+        await (discordMessage.channel as TextChannel).send({
           files: [{ attachment: fileUrl, name: fileName }],
         });
       },
       sendTyping: async () => {
-        await discordMessage.channel.sendTyping();
+        await (discordMessage.channel as TextChannel).sendTyping();
       },
     };
@@ -366,19 +368,21 @@
         return Promise.all(messages.map((msg) => this.convertMessage(msg)));
       },
       sendFile: async (fileUrl, fileName) => {
-        await discordMessage.channel.send({
+        await (discordMessage.channel as TextChannel).send({
           files: [{ attachment: fileUrl, name: fileName }],
         });
       },
       sendTyping: async () => {
-        await discordMessage.channel.sendTyping();
+        await (discordMessage.channel as TextChannel).sendTyping();
       },
       reply: async (messageData) => {
         const sentMessage = await discordMessage.reply(messageData);
         return this.convertSentMessage(sentMessage);
       },
       send: async (messageData) => {
-        const sentMessage = await discordMessage.channel.send(messageData);
+        const sentMessage = await (discordMessage.channel as TextChannel).send(
+          messageData
+        );
         return this.convertSentMessage(sentMessage);
       },
     };
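Note: the `as TextChannel` casts above work around discord.js v14 typing, where `Message#channel` is a union that includes channel variants without `send` or `sendTyping`. A runtime narrow fails louder on unexpected channel types; a minimal sketch, assuming discord.js v14 (`safeSend` is a hypothetical helper, not part of this commit):

import { ChannelType, type Message } from "discord.js";

// Hypothetical alternative to the blanket cast: narrow the channel union at
// runtime before calling send(), and fail loudly otherwise.
async function safeSend(message: Message, content: string) {
  const channel = message.channel;
  if (channel.isTextBased() && "send" in channel) {
    return channel.send(content);
  }
  throw new Error(`Cannot send to channel type ${ChannelType[channel.type]}`);
}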

View File

@@ -1,6 +1,7 @@
 import { Elysia, t } from "elysia";
 import { userConfigs } from "../config";
 import { send_sys_log } from "./log";
+import { get_transcription } from "../tools/ask";

 // Define the type for the event callback
 type EventCallback = (
@@ -187,7 +188,7 @@ export const events = new Elysia()
         body = textbody;
       }
     }
-    // console.log("Event received", body);
+    console.log("Event received", body);

     if (id === "ping") {
       send_sys_log(`Ping event received: ${JSON.stringify(body)}`);

View File

@@ -28,6 +28,8 @@ export interface Embed {
 export interface MessageData {
   content?: string;
   embeds?: Embed[];
+  options?: any;
+  flags?: any;
   file?:
     | {
         url: string;
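Both new fields are typed `any`, so the interface does not constrain their shape; they let platform-specific send options (Discord message flags, later in this commit) pass through `MessageData` untouched. A hedged example of a caller's payload; the import path is hypothetical:

import type { MessageData } from "./types"; // hypothetical path to this interface

// 4096 is Discord's SuppressNotifications flag (1 << 12); the adapter is
// expected to forward `flags` to channel.send() unchanged.
const data: MessageData = {
  content: "Event triggered: timer finished",
  flags: [4096],
};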

View File

@@ -270,7 +270,7 @@ async function executeAction(action: Action) {
   tools = tools?.length ? tools : undefined;

   const response = await ask({
-    model: "gpt-4o-mini",
+    model: "gpt-4o",
     prompt: `You are an Action Executor.

 You are called to execute an action based on the provided instruction.

View File

@@ -2,6 +2,7 @@ import OpenAI from "openai";
 import { saveApiUsage } from "../usage";
 import axios from "axios";
 import fs from "fs";
+import path from "path";
 import { RunnableToolFunctionWithParse } from "openai/lib/RunnableFunction.mjs";
 import {
   ChatCompletion,
@@ -132,13 +133,17 @@
   name,
   tools,
   seed,
+  json,
+  image_url,
 }: {
   model?: string;
   prompt: string;
   message?: string;
+  image_url?: string;
   name?: string;
   tools?: RunnableToolFunctionWithParse<any>[];
   seed?: string;
+  json?: boolean;
 }): Promise<ChatCompletion> {
   // Initialize OpenAI instances
   const openai = new OpenAI({
@@ -171,10 +176,24 @@
       ...history,
       {
         role: "user",
-        content: message,
+        content: image_url
+          ? [
+              {
+                type: "text",
+                text: message,
+              },
+              {
+                type: "image_url",
+                image_url: {
+                  url: image_url,
+                },
+              },
+            ]
+          : message,
         name,
       },
     ];
+    console.log("got image:", image_url?.slice(0, 20));
   } else if (seed && !message) {
     // If seed is provided but no new message, just retrieve history
     const history = getMessageHistory(seed);
@ -189,7 +208,20 @@ export async function ask({
// If no seed but message is provided, send system prompt and user message without history // If no seed but message is provided, send system prompt and user message without history
messages.push({ messages.push({
role: "user", role: "user",
content: message, content: image_url
? [
{
type: "text",
text: message,
},
{
type: "image_url",
image_url: {
url: image_url,
},
},
]
: message,
name, name,
}); });
} }
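With `image_url` set, both branches above replace the plain string `content` with OpenAI's multi-part content array (one `text` part plus one `image_url` part), which is the shape vision-capable chat models expect; a base64 data URL works as well as an https URL. A minimal usage sketch; the file name is hypothetical and `ask` is the helper defined in this file:

import fs from "fs";

// Hypothetical call site; "photo.jpg" is a placeholder file.
const b64 = fs.readFileSync("photo.jpg").toString("base64");
const res = await ask({
  model: "gpt-4o",
  prompt: "You describe images briefly.",
  message: "What is in this picture?",
  image_url: `data:image/jpeg;base64,${b64}`,
});
console.log(res.choices[0].message.content);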
@@ -228,6 +260,7 @@
       model,
       messages,
       tools,
+      response_format: json ? { type: "json_object" } : undefined,
     })
     .on("functionCall", (functionCall) => {
       send_sys_log(`ASK Function call: ${JSON.stringify(functionCall)}`);
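`response_format: { type: "json_object" }` turns on OpenAI's JSON mode: output is constrained to valid JSON, but the API also requires the word "JSON" to appear in the prompt or it rejects the request. A hedged sketch of the new `json` flag:

// Hypothetical call site for the new `json` flag.
const jsonRes = await ask({
  prompt: "Extract the fields and reply in JSON with keys `name` and `city`.",
  message: "Raj lives in Delhi",
  json: true,
});
const parsed = JSON.parse(jsonRes.choices[0].message.content ?? "{}");
console.log(parsed);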
@@ -293,15 +326,20 @@
 const transcriptionCacheFile = pathInDataDir("transcription_cache.json");

 export async function get_transcription(
-  file_url: string,
+  input: string | File, // Accept either a file URL (string) or a File object
   binary?: boolean,
   key?: string
 ) {
+  // const openai = new OpenAI({
+  //   apiKey: ai_token,
+  // });
   const openai = new OpenAI({
-    apiKey: ai_token,
+    apiKey: groq_token,
+    baseURL: groq_baseurl,
   });
-  // Step 1: Check if the transcription for this file URL is already cached
+  // Step 1: Check if the transcription for this input (file_url or File) is already cached
   let transcriptionCache: Record<string, string> = {};

   // Try to read the cache file if it exists
@ -310,56 +348,56 @@ export async function get_transcription(
transcriptionCache = JSON.parse(cacheData); transcriptionCache = JSON.parse(cacheData);
} }
let filePath: string;
let isAudio = false;
let fileExtension: string;
// Determine if the input is a File or URL and handle accordingly
if (input instanceof File) {
// Check the MIME type for audio validation
if (!input.type.startsWith("audio/")) {
throw new Error("The provided file is not an audio file.");
}
isAudio = true;
// Set file extension based on the MIME type
fileExtension = getExtensionFromMimeType(input.type) ?? "ogg";
if (!fileExtension) {
throw new Error(`Unsupported audio file type: ${input.type}`);
}
// Write the file to the filesystem temporarily with the correct extension
filePath = `/tmp/audio${Date.now()}.${fileExtension}`;
const buffer = await input.arrayBuffer();
fs.writeFileSync(filePath, new Uint8Array(buffer));
} else if (typeof input === "string") {
if (binary) { if (binary) {
// If transcription for this file_url is already in the cache, return it // If input is binary data
const binaryData = Buffer.from(input, "base64");
if (key && transcriptionCache[key]) { if (key && transcriptionCache[key]) {
console.log("Transcription found in cache:", transcriptionCache[key]); console.log("Transcription found in cache:", transcriptionCache[key]);
return transcriptionCache[key]; return transcriptionCache[key];
} }
filePath = `/tmp/audio${Date.now()}.ogg`; // Default to .ogg for binary input
const binaryData = Buffer.from(file_url, "base64");
// fs.writeFile("/home/audio_whats.ogg", binaryData, function (err) {});
const filePath = `/tmp/audio${Date.now()}.ogg`;
fs.writeFileSync(filePath, new Uint8Array(binaryData)); fs.writeFileSync(filePath, new Uint8Array(binaryData));
} else {
// Step 3: Send the file to OpenAI's Whisper model // Treat input as a file URL and extract the file extension
const transcription = await openai.audio.transcriptions.create({ fileExtension = path.extname(input).slice(1).toLowerCase();
model: "whisper-1", if (!["mp3", "ogg", "wav", "m4a"].includes(fileExtension)) {
file: fs.createReadStream(filePath), throw new Error(
}); "The provided URL does not point to a valid audio file."
// Delete the temp file
fs.unlinkSync(filePath);
// Step 4: Save the transcription to the cache
key && (transcriptionCache[key] = transcription.text);
fs.writeFileSync(
transcriptionCacheFile,
JSON.stringify(transcriptionCache, null, 2)
); );
console.log("Transcription:", transcription);
return transcription.text;
} }
isAudio = true;
// If transcription for this file_url is already in the cache, return it
if (transcriptionCache[file_url]) {
console.log("Transcription found in cache:", transcriptionCache[file_url]);
return transcriptionCache[file_url];
}
try {
// Step 2: Download the file from the URL // Step 2: Download the file from the URL
const response = await axios({ const response = await axios({
url: file_url, url: input,
method: "GET", method: "GET",
responseType: "stream", responseType: "stream",
}); });
const filePath = `/tmp/audio${Date.now()}.ogg`; filePath = `/tmp/audio${Date.now()}.${fileExtension}`;
// Save the downloaded file locally // Save the downloaded file locally
const writer = fs.createWriteStream(filePath); const writer = fs.createWriteStream(filePath);
@@ -369,18 +407,32 @@
       writer.on("finish", resolve);
       writer.on("error", reject);
     });
+    }
+  } else {
+    throw new Error(
+      "Invalid input type. Must be either a file URL or a File object."
+    );
+  }

-  // Step 3: Send the file to OpenAI's Whisper model
+  try {
+    // Step 3: Send the file to OpenAI's Whisper model for transcription
     const transcription = await openai.audio.transcriptions.create({
-      model: "whisper-1",
+      // model: "whisper-1",
+      model: "distil-whisper-large-v3-en",
       file: fs.createReadStream(filePath),
+      language: "en", // Optional
+      temperature: 0.0, // Optional
     });
     // Delete the temp file
     fs.unlinkSync(filePath);
     // Step 4: Save the transcription to the cache
-    transcriptionCache[file_url] = transcription.text;
+    if (key) {
+      transcriptionCache[key] = transcription.text;
+    } else if (typeof input === "string") {
+      transcriptionCache[input] = transcription.text;
+    }
     fs.writeFileSync(
       transcriptionCacheFile,
       JSON.stringify(transcriptionCache, null, 2)
@@ -390,5 +442,20 @@
     return transcription.text;
   } catch (error) {
     console.error("Error transcribing audio:", error);
+    throw error;
   }
 }
+
+// Helper function to get file extension based on MIME type
+function getExtensionFromMimeType(mimeType: string): string | null {
+  const mimeTypesMap: Record<string, string> = {
+    "audio/mpeg": "mp3",
+    "audio/ogg": "ogg",
+    "audio/wav": "wav",
+    "audio/x-wav": "wav",
+    "audio/x-m4a": "m4a",
+    "audio/m4a": "m4a",
+    // Add other audio types as necessary
+  };
+  return mimeTypesMap[mimeType] || null;
+}
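After this rewrite, `get_transcription` accepts three input shapes: a `File` (validated by MIME type via the helper above), a base64 string with `binary` set (cached under `key`), and a plain URL (validated by extension, downloaded with axios, cached under the URL itself). A usage sketch; all inputs are hypothetical placeholders:

import { get_transcription } from "./ask";

// Hypothetical inputs for illustration.
const someBase64Audio = "T2dnUw=="; // placeholder base64 audio
const audioBytes = new Uint8Array([0x4f, 0x67, 0x67, 0x53]); // placeholder bytes

// 1. URL input: extension-checked, downloaded via axios, cached by URL.
const fromUrl = await get_transcription("https://example.com/note.ogg");

// 2. Base64 input: `binary` set, cached under an explicit key.
const fromB64 = await get_transcription(someBase64Audio, true, "wa-audio-123");

// 3. File input: MIME-checked, written to /tmp with the mapped extension.
const fromFile = await get_transcription(
  new File([audioBytes], "clip.ogg", { type: "audio/ogg" })
);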

View File

@@ -111,6 +111,7 @@ You can use the \`memory_manager\` tool to remember user preferences, such as wh
   const response = await ask({
     prompt,
+    model: "gpt-4o",
     message: `request: ${request}

 prefered_platform: ${prefered_platform}

View File

@ -9,7 +9,7 @@ import path from "path";
import { discordAdapter } from "../interfaces"; import { discordAdapter } from "../interfaces";
import { RunnableToolFunctionWithParse } from "openai/lib/RunnableFunction.mjs"; import { RunnableToolFunctionWithParse } from "openai/lib/RunnableFunction.mjs";
import { getTools, zodFunction } from "."; import { getTools, zodFunction } from ".";
import { ask } from "./ask"; import { ask, get_transcription } from "./ask";
import { get_actions } from "./actions"; import { get_actions } from "./actions";
import { pathInDataDir, userConfigs } from "../config"; import { pathInDataDir, userConfigs } from "../config";
import { memory_manager_guide, memory_manager_init } from "./memory-manager"; import { memory_manager_guide, memory_manager_init } from "./memory-manager";
@@ -398,11 +398,52 @@ function registerListener(listener: EventListener) {
     const is_voice = listener.eventId === "on_voice_message";

+    let attached_image: string | undefined = undefined;
+
     if (is_voice) {
       tools = getTools(
         contextMessage.author.username,
         contextMessage
       ) as RunnableToolFunctionWithParse<any>[];
+
+      const audio = ((payload as any) ?? {}).transcription;
+      if (audio && audio instanceof File) {
+        if (audio.type.includes("audio")) {
+          console.log("Transcribing audio for voice event listener.");
+          (payload as any).transcription = await get_transcription(
+            audio as File
+          );
+        }
+      }
+
+      const otherContextData = (payload as any)?.other_context_data;
+      if (otherContextData instanceof File) {
+        if (otherContextData.type.includes("image")) {
+          // Read the file as a buffer
+          const buffer = await otherContextData.arrayBuffer();
+          // Convert the buffer to a base64 string
+          const base64Url = `data:${
+            otherContextData.type
+          };base64,${Buffer.from(buffer).toString("base64")}`;
+          // Create the object with base64 URL
+          const imageObject = {
+            type: "image_url",
+            image_url: {
+              url: base64Url,
+            },
+          };
+          // Do something with imageObject, like sending it in a response or logging
+          attached_image = base64Url;
+        } else {
+          console.log("The provided file is not an image.");
+        }
+      } else {
+        console.log("No valid file provided in other_context_data.");
+      }
     }

     console.log("Running ASK for event listener: ", listener.description);
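For reference, the payload shape this listener now tolerates: `payload.transcription` may arrive as an audio `File` (replaced in place by its transcript) and `payload.other_context_data` as an image `File` (converted to a base64 data URL and kept in `attached_image`). A hypothetical payload, assuming a runtime with a global `File` (Bun, Node 20+):

// Hypothetical "on_voice_message" payload for illustration only.
const voiceBytes = new Uint8Array([0x4f, 0x67, 0x67, 0x53]); // placeholder bytes
const imageBytes = new Uint8Array([0x89, 0x50, 0x4e, 0x47]); // placeholder bytes

const examplePayload = {
  transcription: new File([voiceBytes], "voice.ogg", { type: "audio/ogg" }),
  other_context_data: new File([imageBytes], "photo.png", { type: "image/png" }),
};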
@@ -471,12 +512,12 @@ function registerListener(listener: EventListener) {
 - Payload: ${JSON.stringify(payload, null, 2)}

 Follow the transcript provided in the payload.
-Reply only in plain text without markdown or any other formatting.
+Your response must be in plain text without markdown or any other formatting.
 `;

     if (system_prompts) {
       prompt = `${system_prompts.map((p) => p.content).join("\n\n")}`;
+      // console.log("Voice system Prompt: ", prompt);
     }

     const response = !is_voice
@@ -486,9 +527,10 @@
           tools,
         })
       : await ask({
-          model: "gpt-4o-mini",
+          model: attached_image ? "gpt-4o" : "gpt-4o-mini",
           prompt,
           message: voice_prompt,
+          image_url: attached_image,
           seed: `voice-anya-${listener.id}-${eventId}`,
           tools,
         });
@@ -503,7 +545,11 @@ function registerListener(listener: EventListener) {
     }

     // Send a message to the user indicating the event was triggered
-    if (notify) await contextMessage.send({ content });
+    if (notify)
+      await contextMessage.send({
+        content,
+        flags: is_voice ? [4096] : undefined,
+      });
     else console.log("Silenced Notification: ", content);

     // Handle auto-stop options
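The literal 4096 is `1 << 12`, Discord's SuppressNotifications flag, so voice-triggered notifications arrive as @silent messages. The named enum documents the intent better; a sketch assuming discord.js v14 (`notifyUser` is a hypothetical wrapper, not part of this commit):

import { MessageFlags } from "discord.js";

// Hypothetical wrapper around an adapter's send(); SuppressNotifications === 4096.
async function notifyUser(
  send: (data: { content: string; flags?: number[] }) => Promise<unknown>,
  content: string,
  is_voice: boolean
) {
  await send({
    content,
    flags: is_voice ? [MessageFlags.SuppressNotifications] : undefined,
  });
}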