structured-outputs-image.ts

import ollama from 'ollama';
import { z } from 'zod';
import { zodToJsonSchema } from 'zod-to-json-schema';
import { readFileSync } from 'fs';
import { resolve as resolvePath } from 'path';
import { createInterface } from 'readline';

/*
  Demonstrates Ollama's vision capabilities with structured outputs.
  The script takes an image file as input and returns a structured JSON
  description of the image contents, including detected objects, scene
  analysis, colors, and any text found in the image.
*/
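
// One way to run this locally (assumes Node 18+, a running Ollama server, and
// a vision-capable model already pulled; the tsx runner is just one option):
//   npm install ollama zod zod-to-json-schema
//   ollama pull llama3.2-vision
//   npx tsx structured-outputs-image.ts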

// Schema for individual objects detected in the image
const ObjectSchema = z.object({
  name: z.string().describe('The name of the object'),
  confidence: z.number().min(0).max(1).describe('The confidence score of the object detection'),
  attributes: z.record(z.any()).optional().describe('Additional attributes of the object')
});

// Schema for the overall image description returned by the model
const ImageDescriptionSchema = z.object({
  summary: z.string().describe('A concise summary of the image'),
  objects: z.array(ObjectSchema).describe('An array of objects detected in the image'),
  scene: z.string().describe('The scene of the image'),
  colors: z.array(z.string()).describe('An array of colors detected in the image'),
  time_of_day: z.enum(['Morning', 'Afternoon', 'Evening', 'Night']).describe('The time of day the image was taken'),
  setting: z.enum(['Indoor', 'Outdoor', 'Unknown']).describe('The setting of the image'),
  text_content: z.string().describe('Any text detected in the image')
});
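
// Shape of a value that satisfies ImageDescriptionSchema (illustrative only,
// not real model output):
// {
//   "summary": "A dog running along a beach at sunset",
//   "objects": [{ "name": "dog", "confidence": 0.95, "attributes": { "color": "brown" } }],
//   "scene": "beach",
//   "colors": ["orange", "blue", "brown"],
//   "time_of_day": "Evening",
//   "setting": "Outdoor",
//   "text_content": ""
// }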

async function run(model: string) {
  // Create a readline interface for user input
  const rl = createInterface({
    input: process.stdin,
    output: process.stdout
  });

  // Get the image path from user input
  const path = await new Promise<string>(resolve => {
    rl.question('Enter the path to your image: ', resolve);
  });
  rl.close();
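
  // Note: on newer Node versions (17+), the readline/promises module offers
  // `await rl.question(...)` directly; the manual Promise wrapper above keeps
  // this compatible with older runtimes.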

  // Verify the file exists and read it
  try {
    const imagePath = resolvePath(path);
    const imageBuffer = readFileSync(imagePath);
    const base64Image = imageBuffer.toString('base64');

    // Convert the Zod schema to JSON Schema format
    const jsonSchema = zodToJsonSchema(ImageDescriptionSchema);

    const messages = [{
      role: 'user',
      content: 'Analyze this image and return a detailed JSON description including objects, scene, colors and any text detected. If you cannot determine certain details, leave those fields empty.',
      images: [base64Image]
    }];
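
    // Passing a JSON Schema object as `format` enables Ollama's structured
    // outputs (Ollama 0.5+): decoding is constrained so the reply matches the schema.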
    const response = await ollama.chat({
      model: model,
      messages: messages,
      format: jsonSchema,
      options: {
        temperature: 0 // Make responses more deterministic
      }
    });

    // Parse and validate the response
    try {
      const imageAnalysis = ImageDescriptionSchema.parse(JSON.parse(response.message.content));
      console.log('Image Analysis:', imageAnalysis);
    } catch (error) {
      console.error('Generated invalid response:', error);
    }
  } catch (error) {
    console.error('Error reading image file:', error);
  }
}

run('llama3.2-vision').catch(console.error);
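
// Design note: temperature 0 plus schema-constrained decoding keeps runs close
// to deterministic. As a sketch (not part of the original example), the model
// name could be taken from the command line instead of being hard-coded:
//   run(process.argv[2] ?? 'llama3.2-vision').catch(console.error);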