Implementation Code - OCR with Logprobs Experiment

Hrishi Olickel

Test code

TSX
#!/usr/bin/env bun
import { GeminiClient } from "./ai/vertex-gemini";
import fs from "fs";
import path from "path";

async function main() {
  const imagePath = Bun.argv[2];

  if (!imagePath) {
    console.error("Please provide an image path");
    process.exit(1);
  }

  /**
   * Supported models:
   * "gemini-1.5-flash-001"
   * "gemini-1.5-flash-002"
   */
  const modelName = "gemini-1.5-flash-002";

  const tableSchema = {
    type: "array",
    items: {
      type: "array",
      items: {
        type: "string",
      },
    },
  };

  const client = new GeminiClient({
    model: modelName,
    generationConfig: {
      temperature: 0,
      responseLogprobs: true,
      logprobs: 5,
      responseMimeType: "application/json",
      responseSchema: tableSchema,
    },
  });

  try {
    // For non-streaming response with logprobs:
    const response = await client.generateContent(
      `Convert the first table in this image ${imagePath} into a 2d array. Feel free to ignore formatting fluff or random things. Keep the headers. The number of columns and rows should match - there are no merged columns.`,
      [{ path: imagePath }]
    );
    console.log("Response:", JSON.stringify(response, null, 2));

    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
    const outputPath = path.join(
      process.cwd(),
      `/data/${timestamp}-${modelName}-${imagePath.split("/").pop()}.json`
    );
    fs.writeFileSync(outputPath, JSON.stringify(response, null, 2));

    // // For streaming response:
    // console.log("\nStreaming response:");
    // for await (const chunk of client.streamContent(
    //   "Describe this image in detail",
    //   [{ path: imagePath }]
    // )) {
    //   console.log("Chunk:", JSON.stringify(chunk));
    // }
  } catch (error) {
    console.error("Error:", error);
  }
}

main();
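
With `responseLogprobs` enabled, the JSON this script saves carries per-token log probabilities alongside the extracted table. Below is a minimal sketch of how you might read them back for inspection. It assumes the Vertex AI response shape (`candidates[0].logprobsResult` with a `chosenCandidates` array of `{ token, logProbability }`); this reader script is not part of the original code, and the field names should be checked against your SDK version.

TSX
#!/usr/bin/env bun
// Hypothetical helper (not from the original post): reads a saved response
// JSON and prints each chosen token with its probability.
import fs from "fs";

const saved = JSON.parse(fs.readFileSync(Bun.argv[2], "utf-8"));

// Assumed Vertex AI payload shape; verify against your SDK version.
const logprobs = saved.response?.candidates?.[0]?.logprobsResult;

for (const chosen of logprobs?.chosenCandidates ?? []) {
  // logProbability is ln(p); exponentiate to get a 0-1 confidence.
  const p = Math.exp(chosen.logProbability);
  console.log(`${chosen.token}\t${(p * 100).toFixed(1)}%`);
}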

Adapter

TSX
import { VertexAI } from "@google-cloud/vertexai";
import { readFileSync } from "fs";

type HarmCategory =
  | "HARM_CATEGORY_HATE_SPEECH"
  | "HARM_CATEGORY_DANGEROUS_CONTENT"
  | "HARM_CATEGORY_SEXUALLY_EXPLICIT"
  | "HARM_CATEGORY_HARASSMENT";

type HarmThreshold =
  | "BLOCK_LOW_AND_ABOVE"
  | "BLOCK_MEDIUM_AND_ABOVE"
  | "BLOCK_ONLY_HIGH"
  | "BLOCK_NONE"
  | "OFF";

interface SafetySetting {
  category: HarmCategory;
  threshold: HarmThreshold;
}

interface GenerationConfig {
  maxOutputTokens?: number;
  temperature?: number;
  topP?: number;
  topK?: number;
  candidateCount?: number;
  stopSequences?: string[];
  seed?: number;
  responseLogprobs?: boolean;
  logprobs?: number;
  responseMimeType?: "application/json" | "text/plain";
  responseSchema?: any;
}

interface GeminiOptions {
  project?: string;
  location?: string;
  model?: string;
  safetySettings?: SafetySetting[];
  generationConfig?: GenerationConfig;
}

interface FileInput {
  path: string;
  mimeType?: string;
}

const DEFAULT_CONFIG: GeminiOptions = {
  project: process.env.GOOGLE_CLOUD_PROJECT,
  location: "us-central1",
  model: "gemini-1.5-flash-002",
  safetySettings: [
    {
      category: "HARM_CATEGORY_HATE_SPEECH",
      threshold: "OFF",
    },
    {
      category: "HARM_CATEGORY_DANGEROUS_CONTENT",
      threshold: "OFF",
    },
    {
      category: "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      threshold: "OFF",
    },
    {
      category: "HARM_CATEGORY_HARASSMENT",
      threshold: "OFF",
    },
  ],
  generationConfig: {
    maxOutputTokens: 8192,
    temperature: 0.2,
    topP: 0.95,
    responseLogprobs: true,
    logprobs: 5,
  },
};

export class GeminiClient {
  private vertexAI: any;
  private model: any;
  private config: GeminiOptions;

  constructor(options: Partial<GeminiOptions> = {}) {
    this.config = {
      ...DEFAULT_CONFIG,
      ...options,
      generationConfig: {
        ...DEFAULT_CONFIG.generationConfig,
        ...options.generationConfig,
      },
      safetySettings: options.safetySettings || DEFAULT_CONFIG.safetySettings,
    };

    if (!this.config.project) {
      throw new Error(
        "Project ID is required. Set GOOGLE_CLOUD_PROJECT env variable or pass in options."
      );
    }

    this.vertexAI = new VertexAI({
      project: this.config.project,
      location: this.config.location,
    });

    this.model = this.vertexAI.preview.getGenerativeModel({
      model: this.config.model,
      generationConfig: this.config.generationConfig,
      safetySettings: this.config.safetySettings,
    });
  }

  /**
   * Convert a file to base64 with proper mime type
   */
  private async processFile(file: FileInput): Promise<{
    inlineData: { mimeType: string; data: string };
  }> {
    const fileData = readFileSync(file.path);
    const base64Data = fileData.toString("base64");
    return {
      inlineData: {
        mimeType: file.mimeType || "image/jpeg", // Default to jpeg if not specified
        data: base64Data,
      },
    };
  }

  /**
   * Generate content with Gemini
   */
  async generateContent(
    prompt: string,
    files: FileInput[] = [],
    streaming: boolean = false
  ) {
    try {
      // Process all files
      const processedFiles = await Promise.all(
        files.map((file) => this.processFile(file))
      );

      // Construct request
      const request = {
        contents: [
          {
            role: "user",
            parts: [...processedFiles, { text: prompt }],
          },
        ],
      };

      if (streaming) {
        return await this.model.generateContentStream(request);
      } else {
        return await this.model.generateContent(request);
      }
    } catch (error) {
      console.error("Error generating content:", error);
      throw error;
    }
  }

  /**
   * Generate content and stream the response
   */
  async *streamContent(prompt: string, files: FileInput[] = []) {
    const streamingResp = await this.generateContent(prompt, files, true);

    for await (const chunk of streamingResp.stream) {
      yield chunk;
    }

    // Return the final aggregated response
    return await streamingResp.response;
  }
}

// Example usage:
/*
const client = new GeminiClient({
  project: 'your-project',
  generationConfig: {
    temperature: 0.4,
    responseLogprobs: true,
    logprobs: 5
  }
});

// Non-streaming usage
const response = await client.generateContent(
  "Describe this image in detail",
  [{ path: "path/to/image.jpg" }]
);

// Streaming usage
for await (const chunk of client.streamContent(
  "Describe this image in detail",
  [{ path: "path/to/image.jpg" }]
)) {
  console.log(chunk);
}
*/
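
Once you have a `logprobsResult`, the interesting part of the experiment is turning it into an uncertainty signal for the OCR output. The sketch below is hypothetical, not part of the original adapter: it flags tokens where the runner-up candidate landed within a small log-probability margin of the chosen token, assuming the `chosenCandidates`/`topCandidates` shape described earlier. A tight margin means the model nearly emitted a different token there, which is where a transcription most likely needs review.

TSX
// Hypothetical sketch (not from the original post): flag low-margin tokens
// as a cheap uncertainty signal. Assumes the Vertex AI logprobsResult shape
// (chosenCandidates / topCandidates); verify against your SDK version.
interface TokenCandidate {
  token: string;
  logProbability: number;
}

function flagUncertainTokens(
  chosen: TokenCandidate[],
  top: { candidates: TokenCandidate[] }[],
  marginNats = 0.5 // flag when the runner-up is within ~0.5 nats
): string[] {
  const flagged: string[] = [];
  chosen.forEach((tok, i) => {
    const alternatives = top[i]?.candidates ?? [];
    // Runner-up = best alternative that isn't the chosen token
    const runnerUp = alternatives.find((c) => c.token !== tok.token);
    if (runnerUp && tok.logProbability - runnerUp.logProbability < marginNats) {
      flagged.push(tok.token);
    }
  });
  return flagged;
}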