Implementation Code - OCR with Logprobs Experiment

Hrishi Olickel

Test code

TSX
#!/usr/bin/env bun
import { GeminiClient } from "./ai/vertex-gemini";
import fs from "fs";
import path from "path";

async function main() {
  const imagePath = Bun.argv[2];

  if (!imagePath) {
    console.error("Please provide an image path");
    process.exit(1);
  }

  /**
   * Supported models:
   * "gemini-1.5-flash-001"
   * "gemini-1.5-flash-002"
   */
  const modelName = "gemini-1.5-flash-002";

  const tableSchema = {
    type: "array",
    items: {
      type: "array",
      items: {
        type: "string",
      },
    },
  };

  const client = new GeminiClient({
    model: modelName,
    generationConfig: {
      temperature: 0,
      responseLogprobs: true,
      logprobs: 5,
      responseMimeType: "application/json",
      responseSchema: tableSchema,
    },
  });

  try {
    // For non-streaming response with logprobs:
    const response = await client.generateContent(
      `Convert the first table in this image ${imagePath} into a 2d array. Feel free to ignore formatting fluff or random things. Keep the headers. The number of columns and rows should match - there are no merged columns.`,
      [{ path: imagePath }]
    );
    console.log("Response:", JSON.stringify(response, null, 2));

    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
    const outputPath = path.join(
      process.cwd(),
      `/data/${timestamp}-${modelName}-${imagePath.split("/").pop()}.json`
    );
    fs.writeFileSync(outputPath, JSON.stringify(response, null, 2));

    // // For streaming response:
    // console.log("\nStreaming response:");
    // for await (const chunk of client.streamContent(
    //   "Describe this image in detail",
    //   [{ path: imagePath }]
    // )) {
    //   console.log("Chunk:", JSON.stringify(chunk));
    // }
  } catch (error) {
    console.error("Error:", error);
  }
}

main();
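
With `responseLogprobs` enabled, the JSON this script saves carries per-token log probabilities alongside the extracted table. Below is a minimal sketch of how you might read them back for inspection. It assumes the Vertex AI response shape (`candidates[0].logprobsResult` with a `chosenCandidates` array of `{ token, logProbability }`); this reader script is not part of the original code, and the field names should be checked against your SDK version.

TSX
#!/usr/bin/env bun
// Hypothetical helper (not from the original post): reads a saved response
// JSON and prints each chosen token with its probability.
import fs from "fs";

const saved = JSON.parse(fs.readFileSync(Bun.argv[2], "utf-8"));

// Assumed Vertex AI payload shape; verify against your SDK version.
const logprobs = saved.response?.candidates?.[0]?.logprobsResult;

for (const chosen of logprobs?.chosenCandidates ?? []) {
  // logProbability is ln(p); exponentiate to get a 0-1 confidence.
  const p = Math.exp(chosen.logProbability);
  console.log(`${chosen.token}\t${(p * 100).toFixed(1)}%`);
}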

Adapter

TSX
import { VertexAI } from "@google-cloud/vertexai";
import { readFileSync } from "fs";

type HarmCategory =
  | "HARM_CATEGORY_HATE_SPEECH"
  | "HARM_CATEGORY_DANGEROUS_CONTENT"
  | "HARM_CATEGORY_SEXUALLY_EXPLICIT"
  | "HARM_CATEGORY_HARASSMENT";

type HarmThreshold =
  | "BLOCK_LOW_AND_ABOVE"
  | "BLOCK_MEDIUM_AND_ABOVE"
  | "BLOCK_ONLY_HIGH"
  | "BLOCK_NONE"
  | "OFF";

interface SafetySetting {
  category: HarmCategory;
  threshold: HarmThreshold;
}

interface GenerationConfig {
  maxOutputTokens?: number;
  temperature?: number;
  topP?: number;
  topK?: number;
  candidateCount?: number;
  stopSequences?: string[];
  seed?: number;
  responseLogprobs?: boolean;
  logprobs?: number;
  responseMimeType?: "application/json" | "text/plain";
  responseSchema?: any;
}

interface GeminiOptions {
  project?: string;
  location?: string;
  model?: string;
  safetySettings?: SafetySetting[];
  generationConfig?: GenerationConfig;
}

interface FileInput {
  path: string;
  mimeType?: string;
}

const DEFAULT_CONFIG: GeminiOptions = {
  project: process.env.GOOGLE_CLOUD_PROJECT,
  location: "us-central1",
  model: "gemini-1.5-flash-002",
  safetySettings: [
    {
      category: "HARM_CATEGORY_HATE_SPEECH",
      threshold: "OFF",
    },
    {
      category: "HARM_CATEGORY_DANGEROUS_CONTENT",
      threshold: "OFF",
    },
    {
      category: "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      threshold: "OFF",
    },
    {
      category: "HARM_CATEGORY_HARASSMENT",
      threshold: "OFF",
    },
  ],
  generationConfig: {
    maxOutputTokens: 8192,
    temperature: 0.2,
    topP: 0.95,
    responseLogprobs: true,
    logprobs: 5,
  },
};

export class GeminiClient {
  private vertexAI: any;
  private model: any;
  private config: GeminiOptions;

  constructor(options: Partial<GeminiOptions> = {}) {
    this.config = {
      ...DEFAULT_CONFIG,
      ...options,
      generationConfig: {
        ...DEFAULT_CONFIG.generationConfig,
        ...options.generationConfig,
      },
      safetySettings: options.safetySettings || DEFAULT_CONFIG.safetySettings,
    };

    if (!this.config.project) {
      throw new Error(
        "Project ID is required. Set GOOGLE_CLOUD_PROJECT env variable or pass in options."
      );
    }

    this.vertexAI = new VertexAI({
      project: this.config.project,
      location: this.config.location,
    });

    this.model = this.vertexAI.preview.getGenerativeModel({
      model: this.config.model,
      generationConfig: this.config.generationConfig,
      safetySettings: this.config.safetySettings,
    });
  }

  /**
   * Convert a file to base64 with proper mime type
   */
  private async processFile(file: FileInput): Promise<{
    inlineData: { mimeType: string; data: string };
  }> {
    const fileData = readFileSync(file.path);
    const base64Data = fileData.toString("base64");
    return {
      inlineData: {
        mimeType: file.mimeType || "image/jpeg", // Default to jpeg if not specified
        data: base64Data,
      },
    };
  }

  /**
   * Generate content with Gemini
   */
  async generateContent(
    prompt: string,
    files: FileInput[] = [],
    streaming: boolean = false
  ) {
    try {
      // Process all files
      const processedFiles = await Promise.all(
        files.map((file) => this.processFile(file))
      );

      // Construct request
      const request = {
        contents: [
          {
            role: "user",
            parts: [...processedFiles, { text: prompt }],
          },
        ],
      };

      if (streaming) {
        return await this.model.generateContentStream(request);
      } else {
        return await this.model.generateContent(request);
      }
    } catch (error) {
      console.error("Error generating content:", error);
      throw error;
    }
  }

  /**
   * Generate content and stream the response
   */
  async *streamContent(prompt: string, files: FileInput[] = []) {
    const streamingResp = await this.generateContent(prompt, files, true);

    for await (const chunk of streamingResp.stream) {
      yield chunk;
    }

    // Return the final aggregated response
    return await streamingResp.response;
  }
}

// Example usage:
/*
const client = new GeminiClient({
  project: 'your-project',
  generationConfig: {
    temperature: 0.4,
    responseLogprobs: true,
    logprobs: 5
  }
});

// Non-streaming usage
const response = await client.generateContent(
  "Describe this image in detail",
  [{ path: "path/to/image.jpg" }]
);

// Streaming usage
for await (const chunk of client.streamContent(
  "Describe this image in detail",
  [{ path: "path/to/image.jpg" }]
)) {
  console.log(chunk);
}
*/
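
Once you have a `logprobsResult`, the interesting part of the experiment is turning it into an uncertainty signal for the OCR output. The sketch below is hypothetical, not part of the original adapter: it flags tokens where the runner-up candidate landed within a small log-probability margin of the chosen token, assuming the `chosenCandidates`/`topCandidates` shape described earlier. A tight margin means the model nearly emitted a different token there, which is where a transcription most likely needs review.

TSX
// Hypothetical sketch (not from the original post): flag low-margin tokens
// as a cheap uncertainty signal. Assumes the Vertex AI logprobsResult shape
// (chosenCandidates / topCandidates); verify against your SDK version.
interface TokenCandidate {
  token: string;
  logProbability: number;
}

function flagUncertainTokens(
  chosen: TokenCandidate[],
  top: { candidates: TokenCandidate[] }[],
  marginNats = 0.5 // flag when the runner-up is within ~0.5 nats
): string[] {
  const flagged: string[] = [];
  chosen.forEach((tok, i) => {
    const alternatives = top[i]?.candidates ?? [];
    // Runner-up = best alternative that isn't the chosen token
    const runnerUp = alternatives.find((c) => c.token !== tok.token);
    if (runnerUp && tok.logProbability - runnerUp.logProbability < marginNats) {
      flagged.push(tok.token);
    }
  });
  return flagged;
}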