import Tesseract from 'tesseract.js';

/**
 * Flattens whitespace in a piece of text.
 *
 * Every run of two or more whitespace characters, and every single tab or
 * newline, is replaced by one space; the result is then trimmed at both ends.
 * Other lone whitespace characters (e.g. a single `\r`) are left as they are.
 *
 * @param {string} text - Raw text to normalize.
 * @returns {string} The flattened, trimmed text.
 */
const cleanText = (text) => {
  const flattened = text.replace(/\s{2,}|\t|\n/g, ' ');
  const trimmed = flattened.trim();
  return trimmed;
};

/**
 * Groups the sentences of a text into chunks of at most `maxChunkSize`
 * characters, breaking only after a period.
 *
 * The text is split on '.', and each period is put back on the sentence it
 * terminated, so joining the returned chunks reproduces `text` exactly. A
 * single sentence longer than `maxChunkSize` is kept whole in a chunk of its
 * own rather than being cut mid-sentence.
 *
 * @param {string} text - Text to split.
 * @param {number} maxChunkSize - Target upper bound on chunk length, in characters.
 * @returns {string[]} Non-empty chunks in original order; empty for empty text.
 */
const chunkTextInSentences = (text, maxChunkSize) => {
  const segments = text.split('.');
  const lastIndex = segments.length - 1;
  const chunks = [];
  let currentChunk = '';

  segments.forEach((segment, index) => {
    // Only segments followed by a delimiter get their period back; the final
    // segment is whatever trails the last '.', so it must not gain one.
    const piece = index < lastIndex ? `${segment}.` : segment;
    if (piece === '') return;

    if (currentChunk.length + piece.length <= maxChunkSize) {
      currentChunk += piece;
      return;
    }

    // Avoid emitting an empty chunk when the very first piece is oversized.
    if (currentChunk) chunks.push(currentChunk);
    currentChunk = piece;
  });

  if (currentChunk) chunks.push(currentChunk);

  return chunks;
};

/**
 * Runs OCR on an image file and returns the recognized text as sentence chunks.
 *
 * The file is read into memory, base64-encoded into a data URL, and passed to
 * `Tesseract.recognize`. The recognized text is whitespace-normalized with
 * `cleanText` and split with `chunkTextInSentences`.
 *
 * This function never rejects for processing failures: any error raised while
 * reading, recognizing, or chunking is logged and reported in the result.
 *
 * @param {File} file - Image file to process; `file.name` is used for logging.
 * @param {object} [options] - Optional tuning; omit for the previous behavior.
 * @param {string} [options.language='eng'] - Language code passed to Tesseract.
 * @param {number} [options.maxChunkSize=1000] - Chunk size passed to `chunkTextInSentences`.
 * @returns {Promise<{success: boolean, file: File, chunks?: string[], error?: unknown}>}
 *   On success `{ success: true, file, chunks }`; on failure
 *   `{ success: false, file, error }`.
 */
export const processImageFile = async (file, { language = 'eng', maxChunkSize = 1000 } = {}) => {
  try {
    const arrayBuffer = await readFileAsArrayBuffer(file);
    const base64Image = encodeBase64(arrayBuffer);

    // NOTE(review): the data URL is labelled image/jpeg regardless of the
    // file's actual type — confirm Tesseract ignores the declared MIME type.
    const result = await Tesseract.recognize(`data:image/jpeg;base64,${base64Image}`, language);

    const extractedText = result.data.text;
    const cleanedText = cleanText(extractedText);
    const chunks = chunkTextInSentences(cleanedText, maxChunkSize);
    console.log({ file_name: file.name, chunks });
    return { success: true, file, chunks };
  } catch (error) {
    console.error('Error processing the image:', error);
    // Surface the cause so callers can report why this file failed.
    return { success: false, file, error };
  }
};

/**
 * Promise wrapper around `FileReader.readAsArrayBuffer`.
 *
 * @param {Blob} file - File or blob handed to the reader.
 * @returns {Promise<ArrayBuffer>} Fulfils with the reader's result once loading
 *   completes; rejects with the reader's error if reading fails.
 */
function readFileAsArrayBuffer(file) {
  // Everything stays inside the executor so that a synchronous throw while
  // creating or starting the reader becomes a rejection, not a thrown error.
  const startRead = (fulfil, fail) => {
    const fileReader = new FileReader();
    fileReader.onerror = () => fail(fileReader.error);
    fileReader.onload = () => fulfil(fileReader.result);
    fileReader.readAsArrayBuffer(file);
  };

  return new Promise(startRead);
}

/**
 * Base64-encodes the raw bytes of a buffer.
 *
 * Each byte is mapped to the character with the same code unit, and the
 * resulting binary string is encoded with `window.btoa`.
 *
 * @param {ArrayBuffer} buffer - Bytes to encode.
 * @returns {string} Base64 representation of the bytes.
 */
function encodeBase64(buffer) {
  const view = new Uint8Array(buffer);
  let binaryString = '';

  for (const byte of view) {
    binaryString += String.fromCharCode(byte);
  }

  return window.btoa(binaryString);
}
