Path: blob/master/deep_learning/seq2seq/translation_mt5/translation_utils.py
2593 views
import os
import subprocess
import tarfile
import zipfile
from urllib.parse import urlparse

import requests
from tqdm import tqdm

# File-name suffixes (lower-case) recognised by ``extract_compressed_file``.
_ZIP_SUFFIXES = ('.zip',)
_TAR_SUFFIXES = ('.tar', '.tar.gz', '.tgz')


def download_file(url: str, directory: str):
    """
    Download the file at ``url`` to ``directory``.
    Extract the file content to ``directory`` if the original file
    is a tar, tar.gz, tgz or zip file.

    Parameters
    ----------
    url : str
        url of the file.

    directory : str
        Directory to download the file. Created if it does not exist.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # Use the response as a context manager so the streamed connection is
    # released back to the pool even if writing the file fails midway.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()

        content_len = response.headers.get('Content-Length')
        total = int(content_len) if content_len is not None else 0

        os.makedirs(directory, exist_ok=True)
        file_name = get_file_name_from_url(url)
        file_path = os.path.join(directory, file_name)

        with tqdm(unit='B', total=total) as pbar, open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                # Skip keep-alive chunks, which arrive empty.
                if chunk:
                    pbar.update(len(chunk))
                    f.write(chunk)

    extract_compressed_file(file_path, directory)


def extract_compressed_file(compressed_file_path: str, directory: str):
    """
    Extract a compressed file to ``directory``. Supports zip, tar.gz, tgz,
    tar extensions. Files with any other extension are left untouched.

    The archive type is decided from the file name's suffix
    (case-insensitive), not from a substring match, so names such as
    ``zipcodes.csv`` are not mistaken for archives.

    Parameters
    ----------
    compressed_file_path : str
        Path of the archive to extract.

    directory : str
        File will be extracted to this directory.

    Raises
    ------
    zipfile.BadZipFile, tarfile.TarError
        If the file carries an archive suffix but cannot be read as such,
        or (tar, on interpreters with extraction filters) contains members
        that would be written outside ``directory``.
    """
    basename = os.path.basename(compressed_file_path).lower()
    if basename.endswith(_ZIP_SUFFIXES):
        with zipfile.ZipFile(compressed_file_path, "r") as zip_f:
            zip_f.extractall(directory)
    elif basename.endswith(_TAR_SUFFIXES):
        # tarfile.open auto-detects the compression (plain, gzip, ...).
        with tarfile.open(compressed_file_path) as tar_f:
            if hasattr(tarfile, 'data_filter'):
                # Archives usually come from the network: refuse absolute
                # paths, ``..`` traversal and links escaping ``directory``.
                tar_f.extractall(directory, filter='data')
            else:
                # Older interpreters have no extraction filters.
                tar_f.extractall(directory)


def get_file_name_from_url(url: str) -> str:
    """
    Return the file_name from a URL

    Only the path component of the URL is considered, so query strings
    and fragments do not end up in the returned name.

    Parameters
    ----------
    url : str
        URL to extract file_name from

    Returns
    -------
    file_name : str
        Last segment of the URL path; empty if the path ends with ``/``.
    """
    parse = urlparse(url)
    return os.path.basename(parse.path)


def create_translation_data(
    source_input_path: str,
    target_input_path: str,
    output_path: str,
    delimiter: str = "\t",
    encoding: str = "utf-8"
):
    """
    Creates the paired source and target dataset from the separated ones.
    e.g. creates `train.tsv` from `train.de` and `train.en`

    The two input files are read in lockstep, one line each. A pair is
    written only when both sides are non-empty after stripping surrounding
    whitespace.

    Parameters
    ----------
    source_input_path : str
        Path of the file holding one source sentence per line.

    target_input_path : str
        Path of the file holding one target sentence per line.

    output_path : str
        Path of the paired file to write (overwritten if it exists).

    delimiter : str, default "\\t"
        Separator placed between the source and target sentence.

    encoding : str, default "utf-8"
        Encoding used for all three files.
    """
    with open(source_input_path, encoding=encoding) as f_source_in, \
            open(target_input_path, encoding=encoding) as f_target_in, \
            open(output_path, "w", encoding=encoding) as f_out:

        # zip stops at the shorter file; unmatched trailing lines could never
        # form a complete pair anyway.
        for source_line, target_line in zip(f_source_in, f_target_in):
            source_raw = source_line.strip()
            target_raw = target_line.strip()
            if source_raw and target_raw:
                f_out.write(source_raw + delimiter + target_raw + "\n")