Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
probml
GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/misc/convert_image_formats.ipynb
1192 views
Kernel: Python 3

Open In Colab

!pip install pdf2image
Collecting pdf2image Downloading pdf2image-1.16.0-py3-none-any.whl (10 kB) Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from pdf2image) (7.1.2) Installing collected packages: pdf2image Successfully installed pdf2image-1.16.0
import pdf2image
!sudo apt-get install poppler-utils
Reading package lists... Done Building dependency tree Reading state information... Done The following package was automatically installed and is no longer required: libnvidia-common-460 Use 'sudo apt autoremove' to remove it. The following NEW packages will be installed: poppler-utils 0 upgraded, 1 newly installed, 0 to remove and 40 not upgraded. Need to get 154 kB of archives. After this operation, 613 kB of additional disk space will be used. Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 poppler-utils amd64 0.62.0-2ubuntu2.12 [154 kB] Fetched 154 kB in 1s (271 kB/s) debconf: unable to initialize frontend: Dialog debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.) debconf: falling back to frontend: Readline debconf: unable to initialize frontend: Readline debconf: (This frontend requires a controlling tty.) debconf: falling back to frontend: Teletype dpkg-preconfigure: unable to re-open stdin: Selecting previously unselected package poppler-utils. (Reading database ... 148489 files and directories currently installed.) Preparing to unpack .../poppler-utils_0.62.0-2ubuntu2.12_amd64.deb ... Unpacking poppler-utils (0.62.0-2ubuntu2.12) ... Setting up poppler-utils (0.62.0-2ubuntu2.12) ... Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
# Make Google Drive visible to this Colab runtime; force_remount=True
# re-mounts even when the mount point is already in use.
from google.colab import drive

drive.mount(
    "/content/drive",
    force_remount=True,
)
Mounted at /content/drive
# pip install pdf2image
# pip install --upgrade pillow
import os
import shutil
from pdf2image import convert_from_path
from PIL import Image
from PIL import ImageCms
import argparse
from glob import glob
from tqdm import tqdm
import functools
import multiprocessing
import concurrent.futures

# Raster formats that are passed straight to the profile conversion
# (matched case-insensitively).
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")


def split_file_name(input_path):
    """Split a path into its components.

    Returns:
        A tuple ``(base_name, dir_name, file_name, ext)`` where ``base_name``
        is the last path component, ``dir_name`` is everything before it,
        and ``file_name`` / ``ext`` are ``base_name`` split at its last dot
        (``ext`` keeps the leading dot).
    """
    dir_name, base_name = os.path.split(input_path)
    file_name, ext = os.path.splitext(base_name)
    return base_name, dir_name, file_name, ext


def convert(
    input_path,
    output_path,
    color_space="CMYK",
    input_profile_path=None,
    output_profile_path=None,
    quality=100,
    verbose=False,
    overwrite=False,
):
    """Convert an image or PDF into the colour space of choice.

    PDFs are first rasterised to a temporary PNG written next to the input
    file (``temp<name>.png``), which is then profile-converted and removed.
    PNG/JPG/JPEG inputs are profile-converted directly. Any other file is
    copied unchanged to ``output_path``.

    Keyword arguments:
    input_path -- the input path of the file
    output_path -- the output path for the result to be written
    color_space -- the colour space to convert to, default value is CMYK
    input_profile_path -- the path to the input ICC profile
    output_profile_path -- the path to the output ICC profile
    quality -- value forwarded as ``quality`` when saving the result
    verbose -- when true, print the exception if the conversion fails
    overwrite -- when false, an existing ``output_path`` is left untouched

    Returns:
        True if the output already existed (and ``overwrite`` is false) or
        the profile conversion succeeded; False if the file was merely
        copied, or if an error occurred.
    """
    try:
        if not overwrite and os.path.exists(output_path):
            return True

        # Compare extensions case-insensitively so ".Pdf", ".Jpg", ... are
        # handled like their lower/upper-case spellings.
        lowered_path = input_path.lower()

        if lowered_path.endswith(".pdf"):
            _, dir_name, file_name, _ = split_file_name(input_path)
            temp_file_path = os.path.join(dir_name, "temp" + file_name)
            print("converting ", input_path, " to ", temp_file_path)
            convert_from_path(
                input_path,
                output_file=temp_file_path,
                fmt="png",
                use_pdftocairo=True,
                single_file=True,
                use_cropbox=True,
            )
            # The rasteriser appends the extension to the requested name.
            temp_file_path += ".png"
            print("converting ", temp_file_path, " to ", output_path)
            try:
                # Propagate the real outcome instead of always reporting
                # success: False means the PNG was only copied.
                return _convert_profiles(
                    temp_file_path,
                    output_path,
                    color_space=color_space,
                    input_profile_path=input_profile_path,
                    output_profile_path=output_profile_path,
                    quality=quality,
                )
            finally:
                # Never leave the intermediate PNG behind, even on failure.
                if os.path.exists(temp_file_path):
                    os.remove(temp_file_path)

        if lowered_path.endswith(_IMAGE_EXTENSIONS):
            return _convert_profiles(
                input_path,
                output_path,
                color_space=color_space,
                input_profile_path=input_profile_path,
                output_profile_path=output_profile_path,
                quality=quality,
            )

        print(f"{input_path} is not a valid image file, copying it instead to {output_path}.")
        shutil.copy(input_path, output_path)
        return False
    except Exception as e:
        # Best-effort batch tool: report (if asked) and signal failure
        # rather than aborting the caller's loop.
        if verbose:
            print(f"Error in file: {input_path}\n", e)
        return False


def _convert_profiles(
    input_path=None,
    output_path=None,
    color_space="CMYK",
    input_profile_path=None,
    output_profile_path=None,
    quality=100,
):
    """Apply an ICC profile-to-profile conversion and save the result.

    Opens ``input_path``, converts it from ``input_profile_path`` to
    ``output_profile_path`` with output mode ``color_space``, and saves it
    to ``output_path`` with the given ``quality`` (coerced with ``int`` so
    numeric strings are accepted).

    Returns:
        True on success. On any error the exception is printed, the input
        file is copied unchanged to ``output_path`` and False is returned.
        An error raised by that fallback copy propagates to the caller.
    """
    try:
        with Image.open(input_path) as im:
            img_cmyk = ImageCms.profileToProfile(
                im,
                input_profile_path,
                output_profile_path,
                renderingIntent=0,
                outputMode=color_space,
            )
            img_cmyk.save(output_path, quality=int(quality))
            return True
    except Exception as e:
        print(e)
        print(f"cannot convert {input_path}, copying it instead.")
        shutil.copy(input_path, output_path)
        return False


# from https://pillow.readthedocs.io/en/stable/handbook/tutorial.html?highlight=cmyk#using-the-image-class
def check_image_properties(input_path):
    """Print the path, format, size and mode of the image at ``input_path``.

    If the file cannot be opened (``OSError``), an error message is printed
    instead of raising.
    """
    try:
        with Image.open(input_path) as im:
            print(input_path, im.format, f"{im.size}x{im.mode}")
    except OSError as e:
        print("error opening the image\n", e)
import os
from glob import glob

# Every file that has an extension in the Drive folder of original figures.
files = glob("/content/drive/MyDrive/MLAPA/book-images-original/*.*")
for path in files:
    print(path)

# Keep the bare file names (extension included), dropping the directory part.
filenames = [os.path.basename(path) for path in files]
print(filenames)
/content/drive/MyDrive/MLAPA/book-images-original/kernelBinaryClassifDemoRVM.pdf /content/drive/MyDrive/MLAPA/book-images-original/kernelBinaryClassifDemologregL2.pdf /content/drive/MyDrive/MLAPA/book-images-original/kernelBinaryClassifDemologregL1.pdf /content/drive/MyDrive/MLAPA/book-images-original/kernelBinaryClassifDemoSVM.pdf ['kernelBinaryClassifDemoRVM.pdf', 'kernelBinaryClassifDemologregL2.pdf', 'kernelBinaryClassifDemologregL1.pdf', 'kernelBinaryClassifDemoSVM.pdf']
import os

# Batch-convert every file listed in `filenames` from `in_folder` into a JPG
# in the colour-space-specific output folder, using ICC profiles stored in
# `profile_folder`.
in_folder = "/content/drive/MyDrive/MLAPA/book-images-original"

# These do not depend on the chosen colour space, so set them once.
rgb_profile = "sRGB Color Space Profile.icm"
cmyk_profile = "USWebCoatedSWOP.icc"
profile_folder = "/content/drive/MyDrive/MLAPA"
input_profile_path = f"{profile_folder}/{rgb_profile}"

for use_rgb in [False]:
    if use_rgb:
        out_folder = "/content/drive/MyDrive/MLAPA/book-images-rgb-80"
        color_space = "RGB"
        quality = 80
        output_profile_path = f"{profile_folder}/{rgb_profile}"
    else:
        out_folder = "/content/drive/MyDrive/MLAPA/book-images-cmyk-100"
        color_space = "CMYK"
        quality = 100
        output_profile_path = f"{profile_folder}/{cmyk_profile}"

    # Without the destination folder every save (and fallback copy) fails.
    os.makedirs(out_folder, exist_ok=True)

    for fname in filenames:
        # Strip only the final extension so dotted names such as
        # "fig.v2.pdf" map to "fig.v2.jpg" instead of colliding on "fig.jpg".
        base = os.path.splitext(fname)[0]
        in_name = f"{in_folder}/{fname}"
        out_name = f"{out_folder}/{base}.jpg"
        print("!converting ", in_name, " to ", out_name)
        convert(
            in_name,
            out_name,
            color_space=color_space,
            quality=quality,
            verbose=True,
            input_profile_path=input_profile_path,
            output_profile_path=output_profile_path,
        )
!converting /content/drive/MyDrive/MLAPA/book-images-original/kernelBinaryClassifDemoRVM.pdf to /content/drive/MyDrive/MLAPA/book-images-cmyk-100/kernelBinaryClassifDemoRVM.jpg converting /content/drive/MyDrive/MLAPA/book-images-original/kernelBinaryClassifDemoRVM.pdf to /content/drive/MyDrive/MLAPA/book-images-original/tempkernelBinaryClassifDemoRVM converting /content/drive/MyDrive/MLAPA/book-images-original/tempkernelBinaryClassifDemoRVM.png to /content/drive/MyDrive/MLAPA/book-images-cmyk-100/kernelBinaryClassifDemoRVM.jpg !converting /content/drive/MyDrive/MLAPA/book-images-original/kernelBinaryClassifDemologregL2.pdf to /content/drive/MyDrive/MLAPA/book-images-cmyk-100/kernelBinaryClassifDemologregL2.jpg converting /content/drive/MyDrive/MLAPA/book-images-original/kernelBinaryClassifDemologregL2.pdf to /content/drive/MyDrive/MLAPA/book-images-original/tempkernelBinaryClassifDemologregL2 converting /content/drive/MyDrive/MLAPA/book-images-original/tempkernelBinaryClassifDemologregL2.png to /content/drive/MyDrive/MLAPA/book-images-cmyk-100/kernelBinaryClassifDemologregL2.jpg !converting /content/drive/MyDrive/MLAPA/book-images-original/kernelBinaryClassifDemologregL1.pdf to /content/drive/MyDrive/MLAPA/book-images-cmyk-100/kernelBinaryClassifDemologregL1.jpg converting /content/drive/MyDrive/MLAPA/book-images-original/kernelBinaryClassifDemologregL1.pdf to /content/drive/MyDrive/MLAPA/book-images-original/tempkernelBinaryClassifDemologregL1 converting /content/drive/MyDrive/MLAPA/book-images-original/tempkernelBinaryClassifDemologregL1.png to /content/drive/MyDrive/MLAPA/book-images-cmyk-100/kernelBinaryClassifDemologregL1.jpg !converting /content/drive/MyDrive/MLAPA/book-images-original/kernelBinaryClassifDemoSVM.pdf to /content/drive/MyDrive/MLAPA/book-images-cmyk-100/kernelBinaryClassifDemoSVM.jpg converting /content/drive/MyDrive/MLAPA/book-images-original/kernelBinaryClassifDemoSVM.pdf to 
/content/drive/MyDrive/MLAPA/book-images-original/tempkernelBinaryClassifDemoSVM converting /content/drive/MyDrive/MLAPA/book-images-original/tempkernelBinaryClassifDemoSVM.png to /content/drive/MyDrive/MLAPA/book-images-cmyk-100/kernelBinaryClassifDemoSVM.jpg
!ls /content/drive/MyDrive/MLAPA/book-images-original
kernelBinaryClassifDemologregL1.pdf kernelBinaryClassifDemoRVM.pdf kernelBinaryClassifDemologregL2.pdf kernelBinaryClassifDemoSVM.pdf
!ls /content/drive/MyDrive/MLAPA/book-images-rgb-80
ae_mnist_conv_20d_linear.jpg svmCvSurf.jpg ae_mnist_conv_20d_samples.jpg vae_mnist_conv_20d_linear.jpg largeMarginPrinciple2.jpg vae_mnist_conv_20d_samples.jpg separable-conv2d.jpg
!ls /content/drive/MyDrive/MLAPA/book-images-cmyk-100
ae_mnist_conv_20d_linear.jpg largeMarginPrinciple2.jpg ae_mnist_conv_20d_samples.jpg separable-conv2d.jpg kernelBinaryClassifDemologregL1.jpg svmCvSurf.jpg kernelBinaryClassifDemologregL2.jpg vae_mnist_conv_20d_linear.jpg kernelBinaryClassifDemoRVM.jpg vae_mnist_conv_20d_samples.jpg kernelBinaryClassifDemoSVM.jpg