import ollama from 'ollama';
import { z } from 'zod';
import { zodToJsonSchema } from 'zod-to-json-schema';
import { readFileSync } from 'fs';
import { resolve } from 'path';
import { createInterface } from 'readline';

/*
  Demonstrates Ollama vision capabilities with structured outputs.
  The script takes an image file as input and returns a structured JSON description of the image contents,
  including detected objects, scene analysis, colors, and any text found in the image.
*/

// Schema for individual objects detected in the image
const ObjectSchema = z.object({
  name: z.string().describe('The name of the object'),
  confidence: z.number().min(0).max(1).describe('The confidence score of the object detection'),
  attributes: z.record(z.any()).optional().describe('Additional attributes of the object')
});

// Schema for the overall image description
const ImageDescriptionSchema = z.object({
  summary: z.string().describe('A concise summary of the image'),
  objects: z.array(ObjectSchema).describe('An array of objects detected in the image'),
  scene: z.string().describe('The scene of the image'),
  colors: z.array(z.string()).describe('An array of colors detected in the image'),
  time_of_day: z.enum(['Morning', 'Afternoon', 'Evening', 'Night']).describe('The time of day the image was taken'),
  setting: z.enum(['Indoor', 'Outdoor', 'Unknown']).describe('The setting of the image'),
  text_content: z.string().describe('Any text detected in the image')
});
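
// Illustrative example (hypothetical values, not model output) of an object this schema accepts:
// {
//   "summary": "A cat sitting on a sunny windowsill",
//   "objects": [{ "name": "cat", "confidence": 0.97, "attributes": { "color": "tabby" } }],
//   "scene": "A living room window in daylight",
//   "colors": ["orange", "white", "brown"],
//   "time_of_day": "Morning",
//   "setting": "Indoor",
//   "text_content": ""
// }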

async function run(model: string) {
  // Create readline interface for user input
  const rl = createInterface({
    input: process.stdin,
    output: process.stdout
  });

  // Get path from user input
  const path = await new Promise<string>(resolve => {
    rl.question('Enter the path to your image: ', resolve);
  });
  rl.close();

  // Read the image file and encode it as base64 (a read failure is reported in the catch block below)
  try {
    const imagePath = resolve(path);
    const imageBuffer = readFileSync(imagePath);
    const base64Image = imageBuffer.toString('base64');

    // Convert the Zod schema to JSON Schema format
    const jsonSchema = zodToJsonSchema(ImageDescriptionSchema);

    const messages = [{
      role: 'user',
      content: 'Analyze this image and return a detailed JSON description including objects, scene, colors and any text detected. If you cannot determine certain details, leave those fields empty.',
      images: [base64Image]
    }];

    const response = await ollama.chat({
      model: model,
      messages: messages,
      format: jsonSchema,
      options: {
        temperature: 0 // Make responses more deterministic
      }
    });

    // Parse and validate the response
    try {
      const imageAnalysis = ImageDescriptionSchema.parse(JSON.parse(response.message.content));
      console.log('Image Analysis:', imageAnalysis);
    } catch (error) {
      console.error('Generated invalid response:', error);
    }
  } catch (error) {
    console.error('Error reading image file:', error);
  }
}

run('llama3.2-vision').catch(console.error);
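
// Other vision-capable models should work here as well, e.g. (assuming the model has been pulled locally):
// run('llava').catch(console.error);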