I am trying to use the relatively new Adobe Extract API to harvest text from PDFs. Adobe has sample code in Python and I have been trying to work with it. I am quite the greenhorn with Python, however, and the code has me stumped. Here it is:
"""Extract the text elements of a PDF with the Adobe PDF Services SDK.

Reads credentials from ``pdfservices-api-credentials.json`` in the project
root, runs an ExtractPDF operation (text elements only) on
``resources/extractPdfInput.pdf`` with custom client time-outs, and saves
the result as a zip under ``output/``.
"""
import logging
import os.path

from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.client_config import ClientConfig
from adobe.pdfservices.operation.exception.exceptions import (
    SdkException,
    ServiceApiException,
    ServiceUsageException,
)
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import (
    ExtractElementType,
)
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import (
    ExtractPDFOptions,
)

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

try:
    # Project root: three directory levels above this script.
    # ``__file__`` (with the double underscores) is the path of the running
    # script; a bare ``file`` is an undefined name.
    base_path = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    )

    # Initial setup, create credentials instance.
    # NOTE(review): the traceback in the question shows from_file() trying to
    # open '<C:/Users/stan/PDFExtractAPI/pdfservices-api-credentials.json>' as
    # the private-key file. That value comes from inside the credentials JSON,
    # not from this script -- it still has the placeholder angle brackets and
    # presumably should name the private key file instead; confirm and fix it
    # in the JSON.
    credentials = (
        Credentials.service_account_credentials_builder()
        .from_file(os.path.join(base_path, "pdfservices-api-credentials.json"))
        .build()
    )

    # Create client config instance with custom time-outs.
    client_config = (
        ClientConfig.builder()
        .with_connect_timeout(10000)
        .with_read_timeout(40000)
        .build()
    )

    # Create an ExecutionContext using credentials and create a new
    # operation instance.
    execution_context = ExecutionContext.create(credentials, client_config)
    extract_pdf_operation = ExtractPDFOperation.create_new()

    # Set operation input from a source file.
    source = FileRef.create_from_local_file(
        os.path.join(base_path, "resources", "extractPdfInput.pdf")
    )
    extract_pdf_operation.set_input(source)

    # Build ExtractPDF options (text elements only) and set them into the
    # operation.
    extract_pdf_options: ExtractPDFOptions = (
        ExtractPDFOptions.builder()
        .with_element_to_extract(ExtractElementType.TEXT)
        .build()
    )
    extract_pdf_operation.set_options(extract_pdf_options)

    # Execute the operation.
    result: FileRef = extract_pdf_operation.execute(execution_context)

    # Save the result to the specified location.
    result.save_as(
        os.path.join(
            base_path, "output", "ExtractTextInfoFromPDFWithCustomTimeouts.zip"
        )
    )
except (ServiceApiException, ServiceUsageException, SdkException):
    # Only SDK/service failures are logged here; other errors (for example an
    # OSError from a bad file path) still propagate with a full traceback.
    logging.exception("Exception encountered while executing operation")
This gives me a symphony of errors:
PS C:\Users\stand\PDFExtractAPI> & C:/Users/stand/AppData/Local/Programs/Python/Python311/python.exe c:/Users/stand/PDFExtractAPI/src/extractpdf/extract_txt_from_pdf_with_custom_timeouts.py C:\Users\stand\AppData\Local\Programs\Python\Python311\Lib\site-packages\requests__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.13) or chardet (5.1.0)/charset_normalizer (2.0.12) doesn't match a supported version! warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported " Traceback (most recent call last): File "c:\Users\stand\PDFExtractAPI\src\extractpdf\extract_txt_from_pdf_with_custom_timeouts.py", line 31, in <module> .from_file(base_path + "/pdfservices-api-credentials.json") \ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\stand\AppData\Local\Programs\Python\Python311\Lib\site-packages\adobe\pdfservices\operation\auth\service_account_credentials.py", line 203, in from_file self._private_key = file_utils.read_conf_file_content(private_key_file_path) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\stand\AppData\Local\Programs\Python\Python311\Lib\site-packages\adobe\pdfservices\operation\internal\util\file_utils.py", line 15, in read_conf_file_content with open(get_file_path(file_path)) as file: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ OSError: [Errno 22] Invalid argument: 'c:\Users\stand\PDFExtractAPI\<C:/Users/stan/PDFExtractAPI/pdfservices-api-credentials.json>' PS C:\Users\stand\PDFExtractAPI> "
ANY help would be most appreciated.