All files / src/utils pdfConverter.ts

100% Statements 54/54
100% Branches 14/14
100% Functions 12/12
100% Lines 54/54

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140                          1x           16x   16x 16x 16x 16x   16x 1x     15x       15x 15x     15x 1x     14x 14x           1x 3x 3x 3x 2x 1x   1x     3x 1x   3x                 1x     11x 9x 9x   1x 1x     2x     9x 9x                 1x       11x 9x 9x     9x 18x 18x 16x     14x   18x       9x       9x 18x 14x     9x 18x 4x       9x 2x         7x    
// src/utils/pdfConverter.ts
import * as pdfjsLib from 'pdfjs-dist';
import { logger } from '../services/logger.client';
import type { PDFDocumentProxy, PDFPageProxy, PageViewport } from 'pdfjs-dist';
 
/**
 * Renders a single PDF page onto an off-screen canvas and encodes it as a JPEG File.
 *
 * The output file name is built from `originalFileName`: a trailing `.pdf`
 * extension (any letter case) is removed and `_page_<pageNumber>.jpeg` is
 * appended. Names that do not end in `.pdf` still receive the suffix, so every
 * page gets a distinct name with a `.jpeg` extension.
 *
 * @param pdfPage The PDF page object from pdf.js.
 * @param pageNumber The page number (1-based); used in the file name and in error messages.
 * @param originalFileName The name of the original PDF file.
 * @param scale The scale passed to `pdfPage.getViewport` for rendering.
 * @returns A promise that resolves to an `image/jpeg` File object for the page.
 * @throws Error if a 2D canvas context cannot be obtained, or if the canvas
 *   cannot be encoded to a blob. Rejections from the pdf.js render task propagate unchanged.
 */
const renderPageToImageFile = async (
  pdfPage: PDFPageProxy,
  pageNumber: number,
  originalFileName: string,
  scale: number,
): Promise<File> => {
  const viewport = pdfPage.getViewport({ scale });

  const canvas = document.createElement('canvas');
  const context = canvas.getContext('2d');

  if (!context) {
    throw new Error('Could not get canvas context');
  }

  canvas.height = viewport.height;
  canvas.width = viewport.width;

  let blob: Blob | null;
  try {
    await pdfPage.render({ canvas, canvasContext: context, viewport: viewport as PageViewport })
      .promise;

    // canvas.toBlob is callback-based; wrap it so it can be awaited.
    blob = await new Promise<Blob | null>((resolve) => {
      canvas.toBlob(resolve, 'image/jpeg', 0.9);
    });
  } finally {
    // The blob (if any) is independent of the canvas at this point. Shrinking the
    // canvas to 0x0 lets the browser reclaim its bitmap memory without waiting for
    // garbage collection, which matters when many pages are rendered at once.
    canvas.width = 0;
    canvas.height = 0;
  }

  if (!blob) {
    throw new Error(`Failed to convert page ${pageNumber} of PDF to blob.`);
  }

  // Strip the extension first and then append, rather than replacing the extension
  // in one step: a name without ".pdf" would otherwise come back unchanged.
  const baseName = originalFileName.replace(/\.pdf$/i, '');
  const newFileName = `${baseName}_page_${pageNumber}.jpeg`;
  return new File([blob], newFileName, { type: 'image/jpeg' });
};
 
/**
 * Reads a File into an ArrayBuffer through the callback-based FileReader API.
 * Used as a fallback for environments where `file.arrayBuffer` is unavailable.
 *
 * @param file The file whose contents should be read.
 * @returns A promise that resolves with the file contents, or rejects with an
 *   Error when the reader reports an error or yields a non-ArrayBuffer result.
 */
const readFileAsArrayBuffer = (file: File): Promise<ArrayBuffer> =>
  new Promise<ArrayBuffer>((fulfil, fail) => {
    const fileReader = new FileReader();

    fileReader.onerror = () => {
      fail(new Error(`FileReader error: ${fileReader.error?.message}`));
    };

    fileReader.onload = () => {
      const contents = fileReader.result;

      // Guard clause: anything other than an ArrayBuffer is treated as a failure.
      if (!(contents instanceof ArrayBuffer)) {
        fail(new Error('FileReader result was not an ArrayBuffer'));
        return;
      }

      fulfil(contents);
    };

    // Both handlers are attached before the read is started.
    fileReader.readAsArrayBuffer(file);
  });
 
/**
 * Loads a pdf.js document from the bytes of a File object.
 *
 * The bytes are read with `pdfFile.arrayBuffer()` when that method exists. If
 * it is missing, or if calling it rejects (a warning is logged in that case),
 * the bytes are read through the FileReader-based helper instead.
 *
 * @param pdfFile The PDF file.
 * @returns A promise that resolves to the pdf.js document object.
 */
const getPdfDocument = async (pdfFile: File) => {
  const hasNativeArrayBuffer = typeof pdfFile.arrayBuffer === 'function';

  let bytes: ArrayBuffer;
  if (!hasNativeArrayBuffer) {
    // No File.prototype.arrayBuffer in this environment: go straight to FileReader.
    bytes = await readFileAsArrayBuffer(pdfFile);
  } else {
    try {
      bytes = await pdfFile.arrayBuffer();
    } catch (error) {
      logger.warn('pdfFile.arrayBuffer() failed, falling back to FileReader', { error });
      bytes = await readFileAsArrayBuffer(pdfFile);
    }
  }

  const loadingTask = pdfjsLib.getDocument(bytes);
  const pdfDocument: PDFDocumentProxy = await loadingTask.promise;
  return pdfDocument;
};
 
/**
 * Converts all pages of a PDF file into an array of image File objects.
 *
 * All pages are rendered concurrently. Pages that fail to render are logged
 * and skipped, so the result may contain fewer images than `pageCount`; the
 * images that are returned keep their page order. The pdf.js document is
 * destroyed before this function settles, whether it succeeds or throws.
 *
 * @param pdfFile The PDF file to convert.
 * @param onProgress Optional callback invoked once per successfully rendered
 *   page with that page's 1-based number and the total page count. Because
 *   pages render concurrently, calls may arrive out of page order. An
 *   exception thrown by the callback is logged and does not discard the page.
 * @returns A promise that resolves to an object containing the array of image files and the total page count.
 * @throws Error if the PDF has at least one page but no page could be converted.
 *   Failures while reading or opening the PDF propagate unchanged.
 */
export const convertPdfToImageFiles = async (
  pdfFile: File,
  onProgress?: (currentPage: number, totalPages: number) => void,
): Promise<{ imageFiles: File[]; pageCount: number }> => {
  const pdf = await getPdfDocument(pdfFile);

  try {
    const pageCount = pdf.numPages;
    const scale = 1.5;

    // Create an array of promises, one for each page rendering task.
    const pagePromises = Array.from({ length: pageCount }, async (_, i) => {
      const pageNumber = i + 1;
      const page = await pdf.getPage(pageNumber);
      const imageFile = await renderPageToImageFile(page, pageNumber, pdfFile.name, scale);

      // Report progress as each page finishes. A faulty callback must not turn an
      // already-rendered page into a failed one, so its errors are contained here.
      try {
        onProgress?.(pageNumber, pageCount);
      } catch (error) {
        logger.warn({ error, pageNumber }, 'onProgress callback threw during PDF conversion.');
      }

      return imageFile;
    });

    // Process all pages in parallel and collect the results.
    const settledResults = await Promise.allSettled(pagePromises);

    // Keep the fulfilled pages (allowing partial success) and log the rejected
    // ones with their page number. Results are in page order, so index + 1 is
    // the page number.
    const imageFiles: File[] = [];
    settledResults.forEach((result, index) => {
      if (result.status === 'fulfilled') {
        imageFiles.push(result.value);
      } else {
        logger.warn(
          { error: result.reason, pageNumber: index + 1 },
          'A page failed to convert during PDF processing.',
        );
      }
    });

    if (imageFiles.length === 0 && pageCount > 0) {
      throw new Error(
        'PDF conversion resulted in zero images, though the PDF has pages. It might be corrupted or contain non-standard content.',
      );
    }

    return { imageFiles, pageCount };
  } finally {
    // The returned Files are standalone blobs, so the document can be released
    // now. A failure to release is logged and must not mask the conversion outcome.
    try {
      await pdf.destroy();
    } catch (error) {
      logger.warn({ error }, 'Failed to release PDF document resources.');
    }
  }
};