Convert PDF Files to Text with OCR

C
Published in: Google Apps Script - PDF

Google Drive can extract text from regular PDF files as well as scanned PDFs through OCR. This is a wrapper utility for Apps Script that converts PDF files to Google Documents and can perform OCR as well. You need to enable the Advanced Drive service (Drive API) for the script before using it.

 /* Credit: https://gist.github.com/mogsdad/e6795e438615d252584f */

  // Usage example: fetch a PDF from Drive as a blob, run it through
  // pdfToText() with English OCR, and log the extracted text.
  // PDF_FILE_ID is not defined in this snippet; supply the Drive file ID of
  // the PDF to convert before running.
  var blob = DriveApp.getFileById(PDF_FILE_ID).getBlob();
  var text = pdfToText(blob, {ocrLanguage: "en"});
  Logger.log(text);

/**
 * Convert a PDF file (blob) to text using Google Drive's built-in OCR.
 *
 * Side effects: two files are inserted into Drive with no parent folder
 * specified - a Google Doc produced by the OCR conversion (named after the
 * source with extension 'gdoc') and a plain-text file holding the extracted
 * text (named after the source with extension 'txt'). Neither is removed.
 *
 * NOTE(review): Drive.Files.insert is the Drive API v2 method name; confirm
 * the Advanced Drive Service version enabled for the script provides it.
 *
 * @param {Blob} pdfFile  Blob of the source PDF. It must provide getName()
 *                        and getContentType().
 * @param {Object} [options]  Optional settings.
 * @param {string} [options.ocrLanguage='en']  Language hint passed to
 *                        Drive's OCR.
 * @returns {string} The text extracted from the PDF.
 * @throws {Error} If the Advanced Drive Service is not available.
 */
function pdfToText ( pdfFile, options ) {
  // `options` may be omitted entirely; never dereference it directly.
  var opts = options || {};

  // Ensure Advanced Drive Service is enabled. When it is not, the `Drive`
  // identifier does not exist, so test for it without issuing an API call
  // (a real call could fail for unrelated reasons and be misreported).
  if (typeof Drive === 'undefined' || !Drive.Files) {
    throw new Error( "Enable 'Drive API' in Resources - Advanced Google Services." );
  }

  // Blob.getName() can return null for an unnamed blob.
  var pdfName = String(pdfFile.getName() || 'untitled');

  // Save PDF as GDOC; `ocr: true` asks Drive to run OCR during conversion.
  var gdocResource = {
    title: swapPdfExtension_(pdfName, 'gdoc'),
    mimeType: pdfFile.getContentType(),
    parents: []
  };
  var insertOpts = {
    ocr: true,
    ocrLanguage: opts.ocrLanguage || 'en'
  };
  var gdocFile = Drive.Files.insert(gdocResource, pdfFile, insertOpts);

  // Get text from GDOC
  var text = DocumentApp.openById(gdocFile.id).getBody().getText();

  // Always save a plain-text copy alongside the converted document.
  var textName = swapPdfExtension_(pdfName, 'txt');
  var textResource = {
    title: textName,
    mimeType: MimeType.PLAIN_TEXT,
    parents: []
  };
  var textBlob = Utilities.newBlob(text, MimeType.PLAIN_TEXT, textName);
  Drive.Files.insert(textResource, textBlob);

  return text;
}

/**
 * Replace a trailing '.pdf' (any letter case) in `name` with '.<extension>'.
 * If `name` has no '.pdf' suffix, the new extension is appended instead.
 *
 * @param {string} name  Original file name.
 * @param {string} extension  New extension, without the leading dot.
 * @returns {string} The renamed file name.
 */
function swapPdfExtension_ ( name, extension ) {
  var pdfSuffix = /\.pdf$/i;
  var suffix = '.' + extension;
  return pdfSuffix.test(name) ? name.replace(pdfSuffix, suffix) : name + suffix;
}
Published in: Google Apps Script - PDF

Looking for something? Find here!

Meet the Author

Web Geek, Tech Columnist
A
Amit Agarwal

Amit Agarwal is a Google Developer Expert in GSuite and Google Apps Script. He holds an engineering degree in Computer Science (I.I.T.) and is the first professional blogger in India. Read more on Lifehacker and YourStory

Get in touch