# Path: blob/master/ invest-robot-contest_TinkoffBotTwitch-main/venv/lib/python3.8/site-packages/pandas/io/stata.py
# 7813 views
"""1Module contains tools for processing Stata files into DataFrames23The StataReader below was originally written by Joe Presbrey as part of PyDTA.4It has been extended and improved by Skipper Seabold from the Statsmodels5project who also developed the StataWriter and was finally added to pandas in6a once again improved version.78You can find more information on http://presbrey.mit.edu/PyDTA and9https://www.statsmodels.org/devel/10"""11from __future__ import annotations1213from collections import abc14import datetime15from io import BytesIO16import os17import struct18import sys19from typing import (20IO,21TYPE_CHECKING,22Any,23AnyStr,24Hashable,25Sequence,26cast,27)28import warnings2930from dateutil.relativedelta import relativedelta31import numpy as np3233from pandas._libs.lib import infer_dtype34from pandas._libs.writers import max_len_string_array35from pandas._typing import (36CompressionOptions,37FilePath,38ReadBuffer,39StorageOptions,40WriteBuffer,41)42from pandas.util._decorators import (43Appender,44doc,45)4647from pandas.core.dtypes.common import (48ensure_object,49is_categorical_dtype,50is_datetime64_dtype,51is_numeric_dtype,52)5354from pandas import (55Categorical,56DatetimeIndex,57NaT,58Timestamp,59isna,60to_datetime,61to_timedelta,62)63from pandas.core.arrays.boolean import BooleanDtype64from pandas.core.arrays.integer import _IntegerDtype65from pandas.core.frame import DataFrame66from pandas.core.indexes.base import Index67from pandas.core.series import Series68from pandas.core.shared_docs import _shared_docs6970from pandas.io.common import get_handle7172if TYPE_CHECKING:73from typing import Literal7475_version_error = (76"Version of given Stata file is {version}. 
pandas supports importing "77"versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "78"114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"79"and 119 (Stata 15/16, over 32,767 variables)."80)8182_statafile_processing_params1 = """\83convert_dates : bool, default True84Convert date variables to DataFrame time values.85convert_categoricals : bool, default True86Read value labels and convert columns to Categorical/Factor variables."""8788_statafile_processing_params2 = """\89index_col : str, optional90Column to set as index.91convert_missing : bool, default False92Flag indicating whether to convert missing values to their Stata93representations. If False, missing values are replaced with nan.94If True, columns containing missing values are returned with95object data types and missing values are represented by96StataMissingValue objects.97preserve_dtypes : bool, default True98Preserve Stata datatypes. If False, numeric data are upcast to pandas99default types for foreign data (float64 or int64).100columns : list or None101Columns to retain. Columns will be returned in the given order. None102returns all columns.103order_categoricals : bool, default True104Flag indicating whether converted categorical data are ordered."""105106_chunksize_params = """\107chunksize : int, default None108Return StataReader object for iterations, returns chunks with109given number of lines."""110111_iterator_params = """\112iterator : bool, default False113Return StataReader object."""114115_reader_notes = """\116Notes117-----118Categorical variables read through an iterator may not have the same119categories and dtype. This occurs when a variable stored in a DTA120file is associated to an incomplete set of value labels that only121label a strict subset of the values."""122123_read_stata_doc = f"""124Read Stata file into DataFrame.125126Parameters127----------128filepath_or_buffer : str, path object or file-like object129Any valid string path is acceptable. 
The string could be a URL. Valid130URL schemes include http, ftp, s3, and file. For file URLs, a host is131expected. A local file could be: ``file://localhost/path/to/table.dta``.132133If you want to pass in a path object, pandas accepts any ``os.PathLike``.134135By file-like object, we refer to objects with a ``read()`` method,136such as a file handle (e.g. via builtin ``open`` function)137or ``StringIO``.138{_statafile_processing_params1}139{_statafile_processing_params2}140{_chunksize_params}141{_iterator_params}142{_shared_docs["decompression_options"]}143{_shared_docs["storage_options"]}144145Returns146-------147DataFrame or StataReader148149See Also150--------151io.stata.StataReader : Low-level reader for Stata data files.152DataFrame.to_stata: Export Stata data files.153154{_reader_notes}155156Examples157--------158159Creating a dummy stata for this example160>>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',161... 'parrot'],162... 'speed': [350, 18, 361, 15]}}) # doctest: +SKIP163>>> df.to_stata('animals.dta') # doctest: +SKIP164165Read a Stata dta file:166167>>> df = pd.read_stata('animals.dta') # doctest: +SKIP168169Read a Stata dta file in 10,000 line chunks:170>>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP171>>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP172>>> df.to_stata('filename.dta') # doctest: +SKIP173174>>> itr = pd.read_stata('filename.dta', chunksize=10000) # doctest: +SKIP175>>> for chunk in itr:176... # Operate on a single chunk, e.g., chunk.mean()177... 
pass # doctest: +SKIP178"""179180_read_method_doc = f"""\181Reads observations from Stata file, converting them into a dataframe182183Parameters184----------185nrows : int186Number of lines to read from data file, if None read whole file.187{_statafile_processing_params1}188{_statafile_processing_params2}189190Returns191-------192DataFrame193"""194195_stata_reader_doc = f"""\196Class for reading Stata dta files.197198Parameters199----------200path_or_buf : path (string), buffer or path object201string, path object (pathlib.Path or py._path.local.LocalPath) or object202implementing a binary read() functions.203{_statafile_processing_params1}204{_statafile_processing_params2}205{_chunksize_params}206{_shared_docs["decompression_options"]}207{_shared_docs["storage_options"]}208209{_reader_notes}210"""211212213_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]214215216stata_epoch = datetime.datetime(1960, 1, 1)217218219# TODO: Add typing. As of January 2020 it is not possible to type this function since220# mypy doesn't understand that a Series and an int can be combined using mathematical221# operations. (+, -).222def _stata_elapsed_date_to_datetime_vec(dates, fmt) -> Series:223"""224Convert from SIF to datetime. https://www.stata.com/help.cgi?datetime225226Parameters227----------228dates : Series229The Stata Internal Format date to convert to datetime according to fmt230fmt : str231The format to convert to. 
Can be, tc, td, tw, tm, tq, th, ty232Returns233234Returns235-------236converted : Series237The converted dates238239Examples240--------241>>> dates = pd.Series([52])242>>> _stata_elapsed_date_to_datetime_vec(dates , "%tw")2430 1961-01-01244dtype: datetime64[ns]245246Notes247-----248datetime/c - tc249milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day250datetime/C - tC - NOT IMPLEMENTED251milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds252date - td253days since 01jan1960 (01jan1960 = 0)254weekly date - tw255weeks since 1960w1256This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.257The datetime value is the start of the week in terms of days in the258year, not ISO calendar weeks.259monthly date - tm260months since 1960m1261quarterly date - tq262quarters since 1960q1263half-yearly date - th264half-years since 1960h1 yearly265date - ty266years since 0000267"""268MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year269MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days270MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days271MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000272MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000273274def convert_year_month_safe(year, month) -> Series:275"""276Convert year and month to datetimes, using pandas vectorized versions277when the date range falls within the range supported by pandas.278Otherwise it falls back to a slower but more robust method279using datetime.280"""281if year.max() < MAX_YEAR and year.min() > MIN_YEAR:282return to_datetime(100 * year + month, format="%Y%m")283else:284index = getattr(year, "index", None)285return Series(286[datetime.datetime(y, m, 1) for y, m in zip(year, month)], index=index287)288289def convert_year_days_safe(year, days) -> Series:290"""291Converts year (e.g. 
1999) and days since the start of the year to a292datetime or datetime64 Series293"""294if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:295return to_datetime(year, format="%Y") + to_timedelta(days, unit="d")296else:297index = getattr(year, "index", None)298value = [299datetime.datetime(y, 1, 1) + relativedelta(days=int(d))300for y, d in zip(year, days)301]302return Series(value, index=index)303304def convert_delta_safe(base, deltas, unit) -> Series:305"""306Convert base dates and deltas to datetimes, using pandas vectorized307versions if the deltas satisfy restrictions required to be expressed308as dates in pandas.309"""310index = getattr(deltas, "index", None)311if unit == "d":312if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:313values = [base + relativedelta(days=int(d)) for d in deltas]314return Series(values, index=index)315elif unit == "ms":316if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:317values = [318base + relativedelta(microseconds=(int(d) * 1000)) for d in deltas319]320return Series(values, index=index)321else:322raise ValueError("format not understood")323base = to_datetime(base)324deltas = to_timedelta(deltas, unit=unit)325return base + deltas326327# TODO(non-nano): If/when pandas supports more than datetime64[ns], this328# should be improved to use correct range, e.g. datetime[Y] for yearly329bad_locs = np.isnan(dates)330has_bad_values = False331if bad_locs.any():332has_bad_values = True333data_col = Series(dates)334data_col[bad_locs] = 1.0 # Replace with NaT335dates = dates.astype(np.int64)336337if fmt.startswith(("%tc", "tc")): # Delta ms relative to base338base = stata_epoch339ms = dates340conv_dates = convert_delta_safe(base, ms, "ms")341elif fmt.startswith(("%tC", "tC")):342343warnings.warn("Encountered %tC format. 
Leaving in Stata Internal Format.")344conv_dates = Series(dates, dtype=object)345if has_bad_values:346conv_dates[bad_locs] = NaT347return conv_dates348# Delta days relative to base349elif fmt.startswith(("%td", "td", "%d", "d")):350base = stata_epoch351days = dates352conv_dates = convert_delta_safe(base, days, "d")353# does not count leap days - 7 days is a week.354# 52nd week may have more than 7 days355elif fmt.startswith(("%tw", "tw")):356year = stata_epoch.year + dates // 52357days = (dates % 52) * 7358conv_dates = convert_year_days_safe(year, days)359elif fmt.startswith(("%tm", "tm")): # Delta months relative to base360year = stata_epoch.year + dates // 12361month = (dates % 12) + 1362conv_dates = convert_year_month_safe(year, month)363elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base364year = stata_epoch.year + dates // 4365quarter_month = (dates % 4) * 3 + 1366conv_dates = convert_year_month_safe(year, quarter_month)367elif fmt.startswith(("%th", "th")): # Delta half-years relative to base368year = stata_epoch.year + dates // 2369month = (dates % 2) * 6 + 1370conv_dates = convert_year_month_safe(year, month)371elif fmt.startswith(("%ty", "ty")): # Years -- not delta372year = dates373first_month = np.ones_like(dates)374conv_dates = convert_year_month_safe(year, first_month)375else:376raise ValueError(f"Date fmt {fmt} not understood")377378if has_bad_values: # Restore NaT for bad values379conv_dates[bad_locs] = NaT380381return conv_dates382383384def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series:385"""386Convert from datetime to SIF. https://www.stata.com/help.cgi?datetime387388Parameters389----------390dates : Series391Series or array containing datetime.datetime or datetime64[ns] to392convert to the Stata Internal Format given by fmt393fmt : str394The format to convert to. 
Can be, tc, td, tw, tm, tq, th, ty395"""396index = dates.index397NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000398US_PER_DAY = NS_PER_DAY / 1000399400def parse_dates_safe(dates, delta=False, year=False, days=False):401d = {}402if is_datetime64_dtype(dates.dtype):403if delta:404time_delta = dates - stata_epoch405d["delta"] = time_delta._values.view(np.int64) // 1000 # microseconds406if days or year:407date_index = DatetimeIndex(dates)408d["year"] = date_index._data.year409d["month"] = date_index._data.month410if days:411days_in_ns = dates.view(np.int64) - to_datetime(412d["year"], format="%Y"413).view(np.int64)414d["days"] = days_in_ns // NS_PER_DAY415416elif infer_dtype(dates, skipna=False) == "datetime":417if delta:418delta = dates._values - stata_epoch419420def f(x: datetime.timedelta) -> float:421return US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds422423v = np.vectorize(f)424d["delta"] = v(delta)425if year:426year_month = dates.apply(lambda x: 100 * x.year + x.month)427d["year"] = year_month._values // 100428d["month"] = year_month._values - d["year"] * 100429if days:430431def g(x: datetime.datetime) -> int:432return (x - datetime.datetime(x.year, 1, 1)).days433434v = np.vectorize(g)435d["days"] = v(dates)436else:437raise ValueError(438"Columns containing dates must contain either "439"datetime64, datetime.datetime or null values."440)441442return DataFrame(d, index=index)443444bad_loc = isna(dates)445index = dates.index446if bad_loc.any():447dates = Series(dates)448if is_datetime64_dtype(dates):449dates[bad_loc] = to_datetime(stata_epoch)450else:451dates[bad_loc] = stata_epoch452453if fmt in ["%tc", "tc"]:454d = parse_dates_safe(dates, delta=True)455conv_dates = d.delta / 1000456elif fmt in ["%tC", "tC"]:457warnings.warn("Stata Internal Format tC not supported.")458conv_dates = dates459elif fmt in ["%td", "td"]:460d = parse_dates_safe(dates, delta=True)461conv_dates = d.delta // US_PER_DAY462elif fmt in ["%tw", "tw"]:463d = parse_dates_safe(dates, 
year=True, days=True)464conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7465elif fmt in ["%tm", "tm"]:466d = parse_dates_safe(dates, year=True)467conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1468elif fmt in ["%tq", "tq"]:469d = parse_dates_safe(dates, year=True)470conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3471elif fmt in ["%th", "th"]:472d = parse_dates_safe(dates, year=True)473conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(int)474elif fmt in ["%ty", "ty"]:475d = parse_dates_safe(dates, year=True)476conv_dates = d.year477else:478raise ValueError(f"Format {fmt} is not a known Stata date format")479480conv_dates = Series(conv_dates, dtype=np.float64)481missing_value = struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]482conv_dates[bad_loc] = missing_value483484return Series(conv_dates, index=index)485486487excessive_string_length_error = """488Fixed width strings in Stata .dta files are limited to 244 (or fewer)489characters. Column '{0}' does not satisfy this restriction. Use the490'version=117' parameter to write the newer (Stata 13 and later) format.491"""492493494class PossiblePrecisionLoss(Warning):495pass496497498precision_loss_doc = """499Column converted from {0} to {1}, and some data are outside of the lossless500conversion range. This may result in a loss of precision in the saved data.501"""502503504class ValueLabelTypeMismatch(Warning):505pass506507508value_label_mismatch_doc = """509Stata value labels (pandas categories) must be strings. Column {0} contains510non-string labels which will be converted to strings. 
Please check that the511Stata data file created has not lost information due to duplicate labels.512"""513514515class InvalidColumnName(Warning):516pass517518519invalid_name_doc = """520Not all pandas column names were valid Stata variable names.521The following replacements have been made:522523{0}524525If this is not what you expect, please make sure you have Stata-compliant526column names in your DataFrame (strings only, max 32 characters, only527alphanumerics and underscores, no Stata reserved words)528"""529530531class CategoricalConversionWarning(Warning):532pass533534535categorical_conversion_warning = """536One or more series with value labels are not fully labeled. Reading this537dataset with an iterator results in categorical variable with different538categories. This occurs since it is not possible to know all possible values539until the entire dataset has been read. To avoid this warning, you can either540read dataset without an iterator, or manually convert categorical data by541``convert_categoricals`` to False and then accessing the variable labels542through the value_labels method of the reader.543"""544545546def _cast_to_stata_types(data: DataFrame) -> DataFrame:547"""548Checks the dtypes of the columns of a pandas DataFrame for549compatibility with the data types and ranges supported by Stata, and550converts if necessary.551552Parameters553----------554data : DataFrame555The DataFrame to check and convert556557Notes558-----559Numeric columns in Stata must be one of int8, int16, int32, float32 or560float64, with some additional value restrictions. int8 and int16 columns561are checked for violations of the value restrictions and upcast if needed.562int64 data is not usable in Stata, and so it is downcast to int32 whenever563the value are in the int32 range, and sidecast to float64 when larger than564this range. 
If the int64 values are outside of the range of those565perfectly representable as float64 values, a warning is raised.566567bool columns are cast to int8. uint columns are converted to int of the568same size if there is no loss in precision, otherwise are upcast to a569larger type. uint64 is currently not supported since it is concerted to570object in a DataFrame.571"""572ws = ""573# original, if small, if large574conversion_data = (575(np.bool_, np.int8, np.int8),576(np.uint8, np.int8, np.int16),577(np.uint16, np.int16, np.int32),578(np.uint32, np.int32, np.int64),579(np.uint64, np.int64, np.float64),580)581582float32_max = struct.unpack("<f", b"\xff\xff\xff\x7e")[0]583float64_max = struct.unpack("<d", b"\xff\xff\xff\xff\xff\xff\xdf\x7f")[0]584585for col in data:586# Cast from unsupported types to supported types587is_nullable_int = isinstance(data[col].dtype, (_IntegerDtype, BooleanDtype))588orig = data[col]589# We need to find orig_missing before altering data below590orig_missing = orig.isna()591if is_nullable_int:592missing_loc = data[col].isna()593if missing_loc.any():594# Replace with always safe value595data.loc[missing_loc, col] = 0596# Replace with NumPy-compatible column597data[col] = data[col].astype(data[col].dtype.numpy_dtype)598dtype = data[col].dtype599for c_data in conversion_data:600if dtype == c_data[0]:601# Value of type variable "_IntType" of "iinfo" cannot be "object"602if data[col].max() <= np.iinfo(c_data[1]).max: # type: ignore[type-var]603dtype = c_data[1]604else:605dtype = c_data[2]606if c_data[2] == np.int64: # Warn if necessary607if data[col].max() >= 2**53:608ws = precision_loss_doc.format("uint64", "float64")609610data[col] = data[col].astype(dtype)611612# Check values and upcast if necessary613if dtype == np.int8:614if data[col].max() > 100 or data[col].min() < -127:615data[col] = data[col].astype(np.int16)616elif dtype == np.int16:617if data[col].max() > 32740 or data[col].min() < -32767:618data[col] = 
data[col].astype(np.int32)619elif dtype == np.int64:620if data[col].max() <= 2147483620 and data[col].min() >= -2147483647:621data[col] = data[col].astype(np.int32)622else:623data[col] = data[col].astype(np.float64)624if data[col].max() >= 2**53 or data[col].min() <= -(2**53):625ws = precision_loss_doc.format("int64", "float64")626elif dtype in (np.float32, np.float64):627value = data[col].max()628if np.isinf(value):629raise ValueError(630f"Column {col} has a maximum value of infinity which is outside "631"the range supported by Stata."632)633if dtype == np.float32 and value > float32_max:634data[col] = data[col].astype(np.float64)635elif dtype == np.float64:636if value > float64_max:637raise ValueError(638f"Column {col} has a maximum value ({value}) outside the range "639f"supported by Stata ({float64_max})"640)641if is_nullable_int:642if orig_missing.any():643# Replace missing by Stata sentinel value644sentinel = StataMissingValue.BASE_MISSING_VALUES[data[col].dtype.name]645data.loc[orig_missing, col] = sentinel646if ws:647warnings.warn(ws, PossiblePrecisionLoss)648649return data650651652class StataValueLabel:653"""654Parse a categorical column and prepare formatted output655656Parameters657----------658catarray : Series659Categorical Series to encode660encoding : {"latin-1", "utf-8"}661Encoding to use for value labels.662"""663664def __init__(self, catarray: Series, encoding: str = "latin-1"):665666if encoding not in ("latin-1", "utf-8"):667raise ValueError("Only latin-1 and utf-8 are supported.")668self.labname = catarray.name669self._encoding = encoding670categories = catarray.cat.categories671self.value_labels: list[tuple[int | float, str]] = list(672zip(np.arange(len(categories)), categories)673)674self.value_labels.sort(key=lambda x: x[0])675676self._prepare_value_labels()677678def _prepare_value_labels(self):679"""Encode value labels."""680681self.text_len = 0682self.txt: list[bytes] = []683self.n = 0684# Offsets (length of categories), converted to 
int32685self.off = np.array([], dtype=np.int32)686# Values, converted to int32687self.val = np.array([], dtype=np.int32)688self.len = 0689690# Compute lengths and setup lists of offsets and labels691offsets: list[int] = []692values: list[int | float] = []693for vl in self.value_labels:694category: str | bytes = vl[1]695if not isinstance(category, str):696category = str(category)697warnings.warn(698value_label_mismatch_doc.format(self.labname),699ValueLabelTypeMismatch,700)701category = category.encode(self._encoding)702offsets.append(self.text_len)703self.text_len += len(category) + 1 # +1 for the padding704values.append(vl[0])705self.txt.append(category)706self.n += 1707708if self.text_len > 32000:709raise ValueError(710"Stata value labels for a single variable must "711"have a combined length less than 32,000 characters."712)713714# Ensure int32715self.off = np.array(offsets, dtype=np.int32)716self.val = np.array(values, dtype=np.int32)717718# Total length719self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len720721def generate_value_label(self, byteorder: str) -> bytes:722"""723Generate the binary representation of the value labels.724725Parameters726----------727byteorder : str728Byte order of the output729730Returns731-------732value_label : bytes733Bytes containing the formatted value label734"""735encoding = self._encoding736bio = BytesIO()737null_byte = b"\x00"738739# len740bio.write(struct.pack(byteorder + "i", self.len))741742# labname743labname = str(self.labname)[:32].encode(encoding)744lab_len = 32 if encoding not in ("utf-8", "utf8") else 128745labname = _pad_bytes(labname, lab_len + 1)746bio.write(labname)747748# padding - 3 bytes749for i in range(3):750bio.write(struct.pack("c", null_byte))751752# value_label_table753# n - int32754bio.write(struct.pack(byteorder + "i", self.n))755756# textlen - int32757bio.write(struct.pack(byteorder + "i", self.text_len))758759# off - int32 array (n elements)760for offset in 
self.off:761bio.write(struct.pack(byteorder + "i", offset))762763# val - int32 array (n elements)764for value in self.val:765bio.write(struct.pack(byteorder + "i", value))766767# txt - Text labels, null terminated768for text in self.txt:769bio.write(text + null_byte)770771return bio.getvalue()772773774class StataNonCatValueLabel(StataValueLabel):775"""776Prepare formatted version of value labels777778Parameters779----------780labname : str781Value label name782value_labels: Dictionary783Mapping of values to labels784encoding : {"latin-1", "utf-8"}785Encoding to use for value labels.786"""787788def __init__(789self,790labname: str,791value_labels: dict[float | int, str],792encoding: Literal["latin-1", "utf-8"] = "latin-1",793):794795if encoding not in ("latin-1", "utf-8"):796raise ValueError("Only latin-1 and utf-8 are supported.")797798self.labname = labname799self._encoding = encoding800self.value_labels: list[tuple[int | float, str]] = sorted(801value_labels.items(), key=lambda x: x[0]802)803self._prepare_value_labels()804805806class StataMissingValue:807"""808An observation's missing value.809810Parameters811----------812value : {int, float}813The Stata missing value code814815Notes816-----817More information: <https://www.stata.com/help.cgi?missing>818819Integer missing values make the code '.', '.a', ..., '.z' to the ranges820101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ...8212147483647 (for int32). 
Missing values for floating point data types are822more complex but the pattern is simple to discern from the following table.823824np.float32 missing values (float in Stata)8250000007f .8260008007f .a8270010007f .b828...82900c0007f .x83000c8007f .y83100d0007f .z832833np.float64 missing values (double in Stata)834000000000000e07f .835000000000001e07f .a836000000000002e07f .b837...838000000000018e07f .x839000000000019e07f .y84000000000001ae07f .z841"""842843# Construct a dictionary of missing values844MISSING_VALUES: dict[float, str] = {}845bases = (101, 32741, 2147483621)846for b in bases:847# Conversion to long to avoid hash issues on 32 bit platforms #8968848MISSING_VALUES[b] = "."849for i in range(1, 27):850MISSING_VALUES[i + b] = "." + chr(96 + i)851852float32_base = b"\x00\x00\x00\x7f"853increment = struct.unpack("<i", b"\x00\x08\x00\x00")[0]854for i in range(27):855key = struct.unpack("<f", float32_base)[0]856MISSING_VALUES[key] = "."857if i > 0:858MISSING_VALUES[key] += chr(96 + i)859int_value = struct.unpack("<i", struct.pack("<f", key))[0] + increment860float32_base = struct.pack("<i", int_value)861862float64_base = b"\x00\x00\x00\x00\x00\x00\xe0\x7f"863increment = struct.unpack("q", b"\x00\x00\x00\x00\x00\x01\x00\x00")[0]864for i in range(27):865key = struct.unpack("<d", float64_base)[0]866MISSING_VALUES[key] = "."867if i > 0:868MISSING_VALUES[key] += chr(96 + i)869int_value = struct.unpack("q", struct.pack("<d", key))[0] + increment870float64_base = struct.pack("q", int_value)871872BASE_MISSING_VALUES = {873"int8": 101,874"int16": 32741,875"int32": 2147483621,876"float32": struct.unpack("<f", float32_base)[0],877"float64": struct.unpack("<d", float64_base)[0],878}879880def __init__(self, value: int | float):881self._value = value882# Conversion to int to avoid hash issues on 32 bit platforms #8968883value = int(value) if value < 2147483648 else float(value)884self._str = self.MISSING_VALUES[value]885886@property887def string(self) -> str:888"""889The 
Stata representation of the missing value: '.', '.a'..'.z'890891Returns892-------893str894The representation of the missing value.895"""896return self._str897898@property899def value(self) -> int | float:900"""901The binary representation of the missing value.902903Returns904-------905{int, float}906The binary representation of the missing value.907"""908return self._value909910def __str__(self) -> str:911return self.string912913def __repr__(self) -> str:914return f"{type(self)}({self})"915916def __eq__(self, other: Any) -> bool:917return (918isinstance(other, type(self))919and self.string == other.string920and self.value == other.value921)922923@classmethod924def get_base_missing_value(cls, dtype: np.dtype) -> int | float:925if dtype.type is np.int8:926value = cls.BASE_MISSING_VALUES["int8"]927elif dtype.type is np.int16:928value = cls.BASE_MISSING_VALUES["int16"]929elif dtype.type is np.int32:930value = cls.BASE_MISSING_VALUES["int32"]931elif dtype.type is np.float32:932value = cls.BASE_MISSING_VALUES["float32"]933elif dtype.type is np.float64:934value = cls.BASE_MISSING_VALUES["float64"]935else:936raise ValueError("Unsupported dtype")937return value938939940class StataParser:941def __init__(self):942943# type code.944# --------------------945# str1 1 = 0x01946# str2 2 = 0x02947# ...948# str244 244 = 0xf4949# byte 251 = 0xfb (sic)950# int 252 = 0xfc951# long 253 = 0xfd952# float 254 = 0xfe953# double 255 = 0xff954# --------------------955# NOTE: the byte type seems to be reserved for categorical variables956# with a label, but the underlying variable is -127 to 100957# we're going to drop the label and cast to int958self.DTYPE_MAP = dict(959list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))960+ [961(251, np.dtype(np.int8)),962(252, np.dtype(np.int16)),963(253, np.dtype(np.int32)),964(254, np.dtype(np.float32)),965(255, np.dtype(np.float64)),966]967)968self.DTYPE_MAP_XML = {96932768: np.dtype(np.uint8), # Keys to GSO97065526: 
np.dtype(np.float64),97165527: np.dtype(np.float32),97265528: np.dtype(np.int32),97365529: np.dtype(np.int16),97465530: np.dtype(np.int8),975}976# error: Argument 1 to "list" has incompatible type "str";977# expected "Iterable[int]" [arg-type]978self.TYPE_MAP = list(range(251)) + list("bhlfd") # type: ignore[arg-type]979self.TYPE_MAP_XML = {980# Not really a Q, unclear how to handle byteswap98132768: "Q",98265526: "d",98365527: "f",98465528: "l",98565529: "h",98665530: "b",987}988# NOTE: technically, some of these are wrong. there are more numbers989# that can be represented. it's the 27 ABOVE and BELOW the max listed990# numeric data type in [U] 12.2.2 of the 11.2 manual991float32_min = b"\xff\xff\xff\xfe"992float32_max = b"\xff\xff\xff\x7e"993float64_min = b"\xff\xff\xff\xff\xff\xff\xef\xff"994float64_max = b"\xff\xff\xff\xff\xff\xff\xdf\x7f"995self.VALID_RANGE = {996"b": (-127, 100),997"h": (-32767, 32740),998"l": (-2147483647, 2147483620),999"f": (1000np.float32(struct.unpack("<f", float32_min)[0]),1001np.float32(struct.unpack("<f", float32_max)[0]),1002),1003"d": (1004np.float64(struct.unpack("<d", float64_min)[0]),1005np.float64(struct.unpack("<d", float64_max)[0]),1006),1007}10081009self.OLD_TYPE_MAPPING = {101098: 251, # byte1011105: 252, # int1012108: 253, # long1013102: 254, # float1014100: 255, # double1015}10161017# These missing values are the generic '.' 
in Stata, and are used1018# to replace nans1019self.MISSING_VALUES = {1020"b": 101,1021"h": 32741,1022"l": 2147483621,1023"f": np.float32(struct.unpack("<f", b"\x00\x00\x00\x7f")[0]),1024"d": np.float64(1025struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]1026),1027}1028self.NUMPY_TYPE_MAP = {1029"b": "i1",1030"h": "i2",1031"l": "i4",1032"f": "f4",1033"d": "f8",1034"Q": "u8",1035}10361037# Reserved words cannot be used as variable names1038self.RESERVED_WORDS = (1039"aggregate",1040"array",1041"boolean",1042"break",1043"byte",1044"case",1045"catch",1046"class",1047"colvector",1048"complex",1049"const",1050"continue",1051"default",1052"delegate",1053"delete",1054"do",1055"double",1056"else",1057"eltypedef",1058"end",1059"enum",1060"explicit",1061"export",1062"external",1063"float",1064"for",1065"friend",1066"function",1067"global",1068"goto",1069"if",1070"inline",1071"int",1072"local",1073"long",1074"NULL",1075"pragma",1076"protected",1077"quad",1078"rowvector",1079"short",1080"typedef",1081"typename",1082"virtual",1083"_all",1084"_N",1085"_skip",1086"_b",1087"_pi",1088"str#",1089"in",1090"_pred",1091"strL",1092"_coef",1093"_rc",1094"using",1095"_cons",1096"_se",1097"with",1098"_n",1099)110011011102class StataReader(StataParser, abc.Iterator):1103__doc__ = _stata_reader_doc11041105def __init__(1106self,1107path_or_buf: FilePath | ReadBuffer[bytes],1108convert_dates: bool = True,1109convert_categoricals: bool = True,1110index_col: str | None = None,1111convert_missing: bool = False,1112preserve_dtypes: bool = True,1113columns: Sequence[str] | None = None,1114order_categoricals: bool = True,1115chunksize: int | None = None,1116compression: CompressionOptions = "infer",1117storage_options: StorageOptions = None,1118):1119super().__init__()1120self.col_sizes: list[int] = []11211122# Arguments to the reader (can be temporarily overridden in1123# calls to read).1124self._convert_dates = convert_dates1125self._convert_categoricals = 
    def __init__(
        self,
        path_or_buf: FilePath | ReadBuffer[bytes],
        convert_dates: bool = True,
        convert_categoricals: bool = True,
        index_col: str | None = None,
        convert_missing: bool = False,
        preserve_dtypes: bool = True,
        columns: Sequence[str] | None = None,
        order_categoricals: bool = True,
        chunksize: int | None = None,
        compression: CompressionOptions = "infer",
        storage_options: StorageOptions = None,
    ):
        super().__init__()
        self.col_sizes: list[int] = []

        # Arguments to the reader (can be temporarily overridden in
        # calls to read).
        self._convert_dates = convert_dates
        self._convert_categoricals = convert_categoricals
        self._index_col = index_col
        self._convert_missing = convert_missing
        self._preserve_dtypes = preserve_dtypes
        self._columns = columns
        self._order_categoricals = order_categoricals
        self._encoding = ""
        self._chunksize = chunksize
        self._using_iterator = False
        if self._chunksize is None:
            self._chunksize = 1
        elif not isinstance(chunksize, int) or chunksize <= 0:
            raise ValueError("chunksize must be a positive integer when set.")

        # State variables for the file
        self._has_string_data = False
        self._missing_values = False
        self._can_read_value_labels = False
        self._column_selector_set = False
        self._value_labels_read = False
        self._data_read = False
        self._dtype: np.dtype | None = None
        self._lines_read = 0

        self._native_byteorder = _set_endianness(sys.byteorder)
        with get_handle(
            path_or_buf,
            "rb",
            storage_options=storage_options,
            is_text=False,
            compression=compression,
        ) as handles:
            # Copy to BytesIO, and ensure no encoding
            self.path_or_buf = BytesIO(handles.handle.read())

        self._read_header()
        self._setup_dtype()

    def __enter__(self) -> StataReader:
        """enter context manager"""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """exit context manager"""
        self.close()

    def close(self) -> None:
        """close the handle if its open"""
        self.path_or_buf.close()

    def _set_encoding(self) -> None:
        """
        Set string encoding which depends on file version
        """
        # Formats 118+ (Stata 14 and later) are UTF-8; older are latin-1.
        if self.format_version < 118:
            self._encoding = "latin-1"
        else:
            self._encoding = "utf-8"

    def _read_header(self) -> None:
        # New-style (117+) files start with an XML-like "<stata_dta>" tag;
        # old-style files start with a single version byte.
        first_char = self.path_or_buf.read(1)
        if struct.unpack("c", first_char)[0] == b"<":
            self._read_new_header()
        else:
            self._read_old_header(first_char)

        # String columns are represented by integer (fixed width) typ codes.
        self.has_string_data = len([x for x in self.typlist if type(x) is int]) > 0

        # calculate size of a data record
        self.col_sizes = [self._calcsize(typ) for typ in self.typlist]

    def _read_new_header(self) -> None:
        """Parse the XML-like header used by dta formats 117-119."""
        # The first part of the header is common to 117 - 119.
        self.path_or_buf.read(27)  # stata_dta><header><release>
        self.format_version = int(self.path_or_buf.read(3))
        if self.format_version not in [117, 118, 119]:
            raise ValueError(_version_error.format(version=self.format_version))
        self._set_encoding()
        self.path_or_buf.read(21)  # </release><byteorder>
        self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<"
        self.path_or_buf.read(15)  # </byteorder><K>
        # Format 119 widens the variable count field from 2 to 4 bytes.
        nvar_type = "H" if self.format_version <= 118 else "I"
        nvar_size = 2 if self.format_version <= 118 else 4
        self.nvar = struct.unpack(
            self.byteorder + nvar_type, self.path_or_buf.read(nvar_size)
        )[0]
        self.path_or_buf.read(7)  # </K><N>

        self.nobs = self._get_nobs()
        self.path_or_buf.read(11)  # </N><label>
        self._data_label = self._get_data_label()
        self.path_or_buf.read(19)  # </label><timestamp>
        self.time_stamp = self._get_time_stamp()
        self.path_or_buf.read(26)  # </timestamp></header><map>
        self.path_or_buf.read(8)  # 0x0000000000000000
        self.path_or_buf.read(8)  # position of <map>

        # The <map> section stores absolute offsets of the other sections;
        # the small additive constants skip the surrounding XML tags.
        self._seek_vartypes = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 16
        )
        self._seek_varnames = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10
        )
        self._seek_sortlist = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10
        )
        self._seek_formats = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 9
        )
        self._seek_value_label_names = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 19
        )

        # Requires version-specific treatment
        self._seek_variable_labels = self._get_seek_variable_labels()

        self.path_or_buf.read(8)  # <characteristics>
        self.data_location = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 6
        )
        self.seek_strls = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 7
        )
        self.seek_value_labels = (
            struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 14
        )

        self.typlist, self.dtyplist = self._get_dtypes(self._seek_vartypes)

        self.path_or_buf.seek(self._seek_varnames)
        self.varlist = self._get_varlist()

        self.path_or_buf.seek(self._seek_sortlist)
        # Sort list has nvar+1 entries; the trailing terminator is dropped.
        self.srtlist = struct.unpack(
            self.byteorder + ("h" * (self.nvar + 1)),
            self.path_or_buf.read(2 * (self.nvar + 1)),
        )[:-1]

        self.path_or_buf.seek(self._seek_formats)
        self.fmtlist = self._get_fmtlist()

        self.path_or_buf.seek(self._seek_value_label_names)
        self.lbllist = self._get_lbllist()

        self.path_or_buf.seek(self._seek_variable_labels)
        self._variable_labels = self._get_variable_labels()

    # Get data type information, works for versions 117-119.
    def _get_dtypes(
        self, seek_vartypes: int
    ) -> tuple[list[int | str], list[str | np.dtype]]:
        """Read the variable-type section and return (typlist, dtyplist)."""
        self.path_or_buf.seek(seek_vartypes)
        raw_typlist = [
            struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0]
            for _ in range(self.nvar)
        ]

        def f(typ: int) -> int | str:
            # Codes <= 2045 are fixed-width string lengths; larger codes map
            # to numeric/strL struct format characters.
            if typ <= 2045:
                return typ
            try:
                return self.TYPE_MAP_XML[typ]
            except KeyError as err:
                raise ValueError(f"cannot convert stata types [{typ}]") from err

        typlist = [f(x) for x in raw_typlist]

        def g(typ: int) -> str | np.dtype:
            if typ <= 2045:
                return str(typ)
            try:
                # error: Incompatible return value type (got "Type[number]", expected
                # "Union[str, dtype]")
                return self.DTYPE_MAP_XML[typ]  # type: ignore[return-value]
            except KeyError as err:
                raise ValueError(f"cannot convert stata dtype [{typ}]") from err

        dtyplist = [g(x) for x in raw_typlist]

        return typlist, dtyplist

    def _get_varlist(self) -> list[str]:
        """Read the variable-name section."""
        # 33 in order formats, 129 in formats 118 and 119
        b = 33 if self.format_version < 118 else 129
        return [self._decode(self.path_or_buf.read(b)) for _ in range(self.nvar)]
    # Returns the format list
    def _get_fmtlist(self) -> list[str]:
        # Field widths grew over the dta format's history.
        if self.format_version >= 118:
            b = 57
        elif self.format_version > 113:
            b = 49
        elif self.format_version > 104:
            b = 12
        else:
            b = 7

        return [self._decode(self.path_or_buf.read(b)) for _ in range(self.nvar)]

    # Returns the label list
    def _get_lbllist(self) -> list[str]:
        if self.format_version >= 118:
            b = 129
        elif self.format_version > 108:
            b = 33
        else:
            b = 9
        return [self._decode(self.path_or_buf.read(b)) for _ in range(self.nvar)]

    def _get_variable_labels(self) -> list[str]:
        """Read per-variable label strings (width is format dependent)."""
        if self.format_version >= 118:
            vlblist = [
                self._decode(self.path_or_buf.read(321)) for _ in range(self.nvar)
            ]
        elif self.format_version > 105:
            vlblist = [
                self._decode(self.path_or_buf.read(81)) for _ in range(self.nvar)
            ]
        else:
            vlblist = [
                self._decode(self.path_or_buf.read(32)) for _ in range(self.nvar)
            ]
        return vlblist

    def _get_nobs(self) -> int:
        """Read the observation count (8 bytes in 118+, else 4 bytes)."""
        if self.format_version >= 118:
            return struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0]
        else:
            return struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]

    def _get_data_label(self) -> str:
        """Read the dataset label (length-prefixed in 117+, fixed width before)."""
        if self.format_version >= 118:
            strlen = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0]
            return self._decode(self.path_or_buf.read(strlen))
        elif self.format_version == 117:
            strlen = struct.unpack("b", self.path_or_buf.read(1))[0]
            return self._decode(self.path_or_buf.read(strlen))
        elif self.format_version > 105:
            return self._decode(self.path_or_buf.read(81))
        else:
            return self._decode(self.path_or_buf.read(32))

    def _get_time_stamp(self) -> str:
        """Read the file creation timestamp string."""
        if self.format_version >= 118:
            strlen = struct.unpack("b", self.path_or_buf.read(1))[0]
            return self.path_or_buf.read(strlen).decode("utf-8")
        elif self.format_version == 117:
            strlen = struct.unpack("b", self.path_or_buf.read(1))[0]
            return self._decode(self.path_or_buf.read(strlen))
        elif self.format_version > 104:
            return self._decode(self.path_or_buf.read(18))
        else:
            # Formats <= 104 carry no timestamp; callers never reach here
            # because such files are rejected earlier.
            raise ValueError()

    def _get_seek_variable_labels(self) -> int:
        """Return the absolute offset of the <variable_labels> section."""
        if self.format_version == 117:
            self.path_or_buf.read(8)  # <variable_labels>, throw away
            # Stata 117 data files do not follow the described format.  This is
            # a work around that uses the previous label, 33 bytes for each
            # variable, 20 for the closing tag and 17 for the opening tag
            return self._seek_value_label_names + (33 * self.nvar) + 20 + 17
        elif self.format_version >= 118:
            return struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 17
        else:
            raise ValueError()

    def _read_old_header(self, first_char: bytes) -> None:
        """Parse the fixed-layout header used by dta formats 104-115."""
        self.format_version = struct.unpack("b", first_char)[0]
        if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
            raise ValueError(_version_error.format(version=self.format_version))
        self._set_encoding()
        self.byteorder = (
            struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 and ">" or "<"
        )
        self.filetype = struct.unpack("b", self.path_or_buf.read(1))[0]
        self.path_or_buf.read(1)  # unused

        self.nvar = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0]
        self.nobs = self._get_nobs()

        self._data_label = self._get_data_label()

        self.time_stamp = self._get_time_stamp()

        # descriptors
        if self.format_version > 108:
            typlist = [ord(self.path_or_buf.read(1)) for _ in range(self.nvar)]
        else:
            # Old formats encode types as characters; remap to 251-255 codes.
            buf = self.path_or_buf.read(self.nvar)
            typlistb = np.frombuffer(buf, dtype=np.uint8)
            typlist = []
            for tp in typlistb:
                if tp in self.OLD_TYPE_MAPPING:
                    typlist.append(self.OLD_TYPE_MAPPING[tp])
                else:
                    typlist.append(tp - 127)  # bytes

        try:
            self.typlist = [self.TYPE_MAP[typ] for typ in typlist]
        except ValueError as err:
            invalid_types = ",".join([str(x) for x in typlist])
            raise ValueError(f"cannot convert stata types [{invalid_types}]") from err
        try:
            self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist]
        except ValueError as err:
            invalid_dtypes = ",".join([str(x) for x in typlist])
            raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") from err

        if self.format_version > 108:
            self.varlist = [
                self._decode(self.path_or_buf.read(33)) for _ in range(self.nvar)
            ]
        else:
            self.varlist = [
                self._decode(self.path_or_buf.read(9)) for _ in range(self.nvar)
            ]
        # nvar+1 sort entries; drop the trailing terminator.
        self.srtlist = struct.unpack(
            self.byteorder + ("h" * (self.nvar + 1)),
            self.path_or_buf.read(2 * (self.nvar + 1)),
        )[:-1]

        self.fmtlist = self._get_fmtlist()

        self.lbllist = self._get_lbllist()

        self._variable_labels = self._get_variable_labels()

        # ignore expansion fields (Format 105 and later)
        # When reading, read five bytes; the last four bytes now tell you
        # the size of the next read, which you discard.  You then continue
        # like this until you read 5 bytes of zeros.

        if self.format_version > 104:
            while True:
                data_type = struct.unpack(
                    self.byteorder + "b", self.path_or_buf.read(1)
                )[0]
                if self.format_version > 108:
                    data_len = struct.unpack(
                        self.byteorder + "i", self.path_or_buf.read(4)
                    )[0]
                else:
                    data_len = struct.unpack(
                        self.byteorder + "h", self.path_or_buf.read(2)
                    )[0]
                if data_type == 0:
                    break
                self.path_or_buf.read(data_len)

        # necessary data to continue parsing
        self.data_location = self.path_or_buf.tell()

    def _setup_dtype(self) -> np.dtype:
        """Map between numpy and Stata dtypes for a full data record."""
        if self._dtype is not None:
            # Cached after the first call.
            return self._dtype

        dtypes = []  # Convert struct data types to numpy data type
        for i, typ in enumerate(self.typlist):
            if typ in self.NUMPY_TYPE_MAP:
                typ = cast(str, typ)  # only strs in NUMPY_TYPE_MAP
                dtypes.append(("s" + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ]))
            else:
                # Integer typ codes are fixed-width string lengths.
                dtypes.append(("s" + str(i), "S" + str(typ)))
        self._dtype = np.dtype(dtypes)

        return self._dtype

    def _calcsize(self, fmt: int | str) -> int:
        """Return the on-disk byte width of one column."""
        if isinstance(fmt, int):
            return fmt
        return struct.calcsize(self.byteorder + fmt)

    def _decode(self, s: bytes) -> str:
        """Decode a null-terminated byte string using the file's encoding."""
        # have bytes not strings, so must decode
        s = s.partition(b"\0")[0]
        try:
            return s.decode(self._encoding)
        except UnicodeDecodeError:
            # GH 25960, fallback to handle incorrect format produced when 117
            # files are converted to 118 files in Stata
            encoding = self._encoding
            msg = f"""
One or more strings in the dta file could not be decoded using {encoding}, and
so the fallback encoding of latin-1 is being used. This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct."""
            warnings.warn(msg, UnicodeWarning)
            return s.decode("latin-1")
You should verify1512the string values returned are correct."""1513warnings.warn(msg, UnicodeWarning)1514return s.decode("latin-1")15151516def _read_value_labels(self) -> None:1517if self._value_labels_read:1518# Don't read twice1519return1520if self.format_version <= 108:1521# Value labels are not supported in version 108 and earlier.1522self._value_labels_read = True1523self.value_label_dict: dict[str, dict[float | int, str]] = {}1524return15251526if self.format_version >= 117:1527self.path_or_buf.seek(self.seek_value_labels)1528else:1529assert self._dtype is not None1530offset = self.nobs * self._dtype.itemsize1531self.path_or_buf.seek(self.data_location + offset)15321533self._value_labels_read = True1534self.value_label_dict = {}15351536while True:1537if self.format_version >= 117:1538if self.path_or_buf.read(5) == b"</val": # <lbl>1539break # end of value label table15401541slength = self.path_or_buf.read(4)1542if not slength:1543break # end of value label table (format < 117)1544if self.format_version <= 117:1545labname = self._decode(self.path_or_buf.read(33))1546else:1547labname = self._decode(self.path_or_buf.read(129))1548self.path_or_buf.read(3) # padding15491550n = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]1551txtlen = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]1552off = np.frombuffer(1553self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n1554)1555val = np.frombuffer(1556self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n1557)1558ii = np.argsort(off)1559off = off[ii]1560val = val[ii]1561txt = self.path_or_buf.read(txtlen)1562self.value_label_dict[labname] = {}1563for i in range(n):1564end = off[i + 1] if i < n - 1 else txtlen1565self.value_label_dict[labname][val[i]] = self._decode(txt[off[i] : end])1566if self.format_version >= 117:1567self.path_or_buf.read(6) # </lbl>1568self._value_labels_read = True15691570def _read_strls(self) -> 
None:1571self.path_or_buf.seek(self.seek_strls)1572# Wrap v_o in a string to allow uint64 values as keys on 32bit OS1573self.GSO = {"0": ""}1574while True:1575if self.path_or_buf.read(3) != b"GSO":1576break15771578if self.format_version == 117:1579v_o = struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0]1580else:1581buf = self.path_or_buf.read(12)1582# Only tested on little endian file on little endian machine.1583v_size = 2 if self.format_version == 118 else 31584if self.byteorder == "<":1585buf = buf[0:v_size] + buf[4 : (12 - v_size)]1586else:1587# This path may not be correct, impossible to test1588buf = buf[0:v_size] + buf[(4 + v_size) :]1589v_o = struct.unpack("Q", buf)[0]1590typ = struct.unpack("B", self.path_or_buf.read(1))[0]1591length = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0]1592va = self.path_or_buf.read(length)1593if typ == 130:1594decoded_va = va[0:-1].decode(self._encoding)1595else:1596# Stata says typ 129 can be binary, so use str1597decoded_va = str(va)1598# Wrap v_o in a string to allow uint64 values as keys on 32bit OS1599self.GSO[str(v_o)] = decoded_va16001601def __next__(self) -> DataFrame:1602self._using_iterator = True1603return self.read(nrows=self._chunksize)16041605def get_chunk(self, size: int | None = None) -> DataFrame:1606"""1607Reads lines from Stata file and returns as dataframe16081609Parameters1610----------1611size : int, defaults to None1612Number of lines to read. 
    @Appender(_read_method_doc)
    def read(
        self,
        nrows: int | None = None,
        convert_dates: bool | None = None,
        convert_categoricals: bool | None = None,
        index_col: str | None = None,
        convert_missing: bool | None = None,
        preserve_dtypes: bool | None = None,
        columns: Sequence[str] | None = None,
        order_categoricals: bool | None = None,
    ) -> DataFrame:
        # Handle empty file or chunk.  If reading incrementally raise
        # StopIteration.  If reading the whole thing return an empty
        # data frame.
        if (self.nobs == 0) and (nrows is None):
            self._can_read_value_labels = True
            self._data_read = True
            self.close()
            return DataFrame(columns=self.varlist)

        # Handle options: per-call arguments override constructor defaults.
        if convert_dates is None:
            convert_dates = self._convert_dates
        if convert_categoricals is None:
            convert_categoricals = self._convert_categoricals
        if convert_missing is None:
            convert_missing = self._convert_missing
        if preserve_dtypes is None:
            preserve_dtypes = self._preserve_dtypes
        if columns is None:
            columns = self._columns
        if order_categoricals is None:
            order_categoricals = self._order_categoricals
        if index_col is None:
            index_col = self._index_col

        if nrows is None:
            nrows = self.nobs

        if (self.format_version >= 117) and (not self._value_labels_read):
            self._can_read_value_labels = True
            self._read_strls()

        # Read data
        assert self._dtype is not None
        dtype = self._dtype
        max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
        read_len = nrows * dtype.itemsize
        read_len = min(read_len, max_read_len)
        if read_len <= 0:
            # Iterator has finished, should never be here unless
            # we are reading the file incrementally
            if convert_categoricals:
                self._read_value_labels()
            self.close()
            raise StopIteration
        offset = self._lines_read * dtype.itemsize
        self.path_or_buf.seek(self.data_location + offset)
        read_lines = min(nrows, self.nobs - self._lines_read)
        raw_data = np.frombuffer(
            self.path_or_buf.read(read_len), dtype=dtype, count=read_lines
        )

        self._lines_read += read_lines
        if self._lines_read == self.nobs:
            self._can_read_value_labels = True
            self._data_read = True
        # if necessary, swap the byte order to native here
        if self.byteorder != self._native_byteorder:
            raw_data = raw_data.byteswap().newbyteorder()

        if convert_categoricals:
            self._read_value_labels()

        if len(raw_data) == 0:
            data = DataFrame(columns=self.varlist)
        else:
            data = DataFrame.from_records(raw_data)
            data.columns = Index(self.varlist)

        # If index is not specified, use actual row number rather than
        # restarting at 0 for each chunk.
        if index_col is None:
            rng = np.arange(self._lines_read - read_lines, self._lines_read)
            data.index = Index(rng)  # set attr instead of set_index to avoid copy

        if columns is not None:
            try:
                data = self._do_select_columns(data, columns)
            except ValueError:
                self.close()
                raise

        # Decode strings
        for col, typ in zip(data, self.typlist):
            if type(typ) is int:
                # integer typ code == fixed-width string column
                data[col] = data[col].apply(self._decode, convert_dtype=True)

        data = self._insert_strls(data)

        cols_ = np.where([dtyp is not None for dtyp in self.dtyplist])[0]
        # Convert columns (if needed) to match input type
        ix = data.index
        requires_type_conversion = False
        data_formatted = []
        for i in cols_:
            if self.dtyplist[i] is not None:
                col = data.columns[i]
                dtype = data[col].dtype
                if dtype != np.dtype(object) and dtype != self.dtyplist[i]:
                    requires_type_conversion = True
                    data_formatted.append(
                        (col, Series(data[col], ix, self.dtyplist[i]))
                    )
                else:
                    data_formatted.append((col, data[col]))
        if requires_type_conversion:
            data = DataFrame.from_dict(dict(data_formatted))
        del data_formatted

        data = self._do_convert_missing(data, convert_missing)

        if convert_dates:

            def any_startswith(x: str) -> bool:
                return any(x.startswith(fmt) for fmt in _date_formats)

            # Convert columns whose display format marks them as dates.
            cols = np.where([any_startswith(x) for x in self.fmtlist])[0]
            for i in cols:
                col = data.columns[i]
                try:
                    data[col] = _stata_elapsed_date_to_datetime_vec(
                        data[col], self.fmtlist[i]
                    )
                except ValueError:
                    self.close()
                    raise

        if convert_categoricals and self.format_version > 108:
            data = self._do_convert_categoricals(
                data, self.value_label_dict, self.lbllist, order_categoricals
            )

        if not preserve_dtypes:
            # Upcast to pandas-default widths (float64 / int64) on request.
            retyped_data = []
            convert = False
            for col in data:
                dtype = data[col].dtype
                if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
                    dtype = np.dtype(np.float64)
                    convert = True
                elif dtype in (
                    np.dtype(np.int8),
                    np.dtype(np.int16),
                    np.dtype(np.int32),
                ):
                    dtype = np.dtype(np.int64)
                    convert = True
                retyped_data.append((col, data[col].astype(dtype)))
            if convert:
                data = DataFrame.from_dict(dict(retyped_data))

        if index_col is not None:
            data = data.set_index(data.pop(index_col))

        return data

    def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame:
        # Check for missing values, and replace if found
        replacements = {}
        for i, colname in enumerate(data):
            fmt = self.typlist[i]
            if fmt not in self.VALID_RANGE:
                continue

            fmt = cast(str, fmt)  # only strs in VALID_RANGE
            nmin, nmax = self.VALID_RANGE[fmt]
            series = data[colname]

            # appreciably faster to do this with ndarray instead of Series
            svals = series._values
            missing = (svals < nmin) | (svals > nmax)

            if not missing.any():
                continue

            if convert_missing:  # Replacement follows Stata notation
                missing_loc = np.nonzero(np.asarray(missing))[0]
                umissing, umissing_loc = np.unique(series[missing], return_inverse=True)
                replacement = Series(series, dtype=object)
                for j, um in enumerate(umissing):
                    missing_value = StataMissingValue(um)

                    loc = missing_loc[umissing_loc == j]
                    replacement.iloc[loc] = missing_value
            else:  # All replacements are identical
                dtype = series.dtype
                if dtype not in (np.float32, np.float64):
                    dtype = np.float64
                replacement = Series(series, dtype=dtype)
                if not replacement._values.flags["WRITEABLE"]:
                    # only relevant for ArrayManager; construction
                    # path for BlockManager ensures writeability
                    replacement = replacement.copy()
                # Note: operating on ._values is much faster than directly
                # TODO: can we fix that?
                replacement._values[missing] = np.nan
            replacements[colname] = replacement

        if replacements:
            for col in replacements:
                data[col] = replacements[col]
        return data
    def _insert_strls(self, data: DataFrame) -> DataFrame:
        """Replace strL ("Q") columns with the decoded strings from GSO."""
        if not hasattr(self, "GSO") or len(self.GSO) == 0:
            return data
        for i, typ in enumerate(self.typlist):
            if typ != "Q":
                continue
            # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
            data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]]
        return data

    def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFrame:
        """Restrict the frame (and reader metadata) to the requested columns."""
        if not self._column_selector_set:
            column_set = set(columns)
            if len(column_set) != len(columns):
                raise ValueError("columns contains duplicate entries")
            unmatched = column_set.difference(data.columns)
            if unmatched:
                joined = ", ".join(list(unmatched))
                raise ValueError(
                    "The following columns were not "
                    f"found in the Stata data set: {joined}"
                )
            # Copy information for retained columns for later processing
            dtyplist = []
            typlist = []
            fmtlist = []
            lbllist = []
            for col in columns:
                i = data.columns.get_loc(col)
                dtyplist.append(self.dtyplist[i])
                typlist.append(self.typlist[i])
                fmtlist.append(self.fmtlist[i])
                lbllist.append(self.lbllist[i])

            self.dtyplist = dtyplist
            self.typlist = typlist
            self.fmtlist = fmtlist
            self.lbllist = lbllist
            # Only filter the metadata once so iterated chunks stay aligned.
            self._column_selector_set = True

        return data[columns]

    def _do_convert_categoricals(
        self,
        data: DataFrame,
        value_label_dict: dict[str, dict[float | int, str]],
        lbllist: Sequence[str],
        order_categoricals: bool,
    ) -> DataFrame:
        """
        Converts categorical columns to Categorical type.
        """
        value_labels = list(value_label_dict.keys())
        cat_converted_data = []
        for col, label in zip(data, lbllist):
            if label in value_labels:
                # Explicit call with ordered=True
                vl = value_label_dict[label]
                keys = np.array(list(vl.keys()))
                column = data[col]
                key_matches = column.isin(keys)
                if self._using_iterator and key_matches.all():
                    initial_categories: np.ndarray | None = keys
                    # If all categories are in the keys and we are iterating,
                    # use the same keys for all chunks. If some are missing
                    # value labels, then we will fall back to the categories
                    # varying across chunks.
                else:
                    if self._using_iterator:
                        # warn is using an iterator
                        warnings.warn(
                            categorical_conversion_warning, CategoricalConversionWarning
                        )
                    initial_categories = None
                cat_data = Categorical(
                    column, categories=initial_categories, ordered=order_categoricals
                )
                if initial_categories is None:
                    # If None here, then we need to match the cats in the Categorical
                    categories = []
                    for category in cat_data.categories:
                        if category in vl:
                            categories.append(vl[category])
                        else:
                            categories.append(category)
                else:
                    # If all cats are matched, we can use the values
                    categories = list(vl.values())
                try:
                    # Try to catch duplicate categories
                    cat_data.categories = categories
                except ValueError as err:
                    vc = Series(categories).value_counts()
                    repeated_cats = list(vc.index[vc > 1])
                    repeats = "-" * 80 + "\n" + "\n".join(repeated_cats)
                    # GH 25772
                    msg = f"""
Value labels for column {col} are not unique. These cannot be converted to
pandas categoricals.

Either read the file with `convert_categoricals` set to False or use the
low level interface in `StataReader` to separately read the values and the
value_labels.

The repeated labels are:
{repeats}
"""
                    raise ValueError(msg) from err
                # TODO: is the next line needed above in the data(...) method?
                cat_series = Series(cat_data, index=data.index)
                cat_converted_data.append((col, cat_series))
            else:
                cat_converted_data.append((col, data[col]))
        data = DataFrame(dict(cat_converted_data), copy=False)
        return data

    @property
    def data_label(self) -> str:
        """
        Return data label of Stata file.
        """
        return self._data_label

    def variable_labels(self) -> dict[str, str]:
        """
        Return variable labels as a dict, associating each variable name
        with corresponding label.

        Returns
        -------
        dict
        """
        return dict(zip(self.varlist, self._variable_labels))

    def value_labels(self) -> dict[str, dict[float | int, str]]:
        """
        Return a dict, associating each variable name a dict, associating
        each value its corresponding label.

        Returns
        -------
        dict
        """
        if not self._value_labels_read:
            self._read_value_labels()

        return self.value_label_dict


@Appender(_read_stata_doc)
def read_stata(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    convert_dates: bool = True,
    convert_categoricals: bool = True,
    index_col: str | None = None,
    convert_missing: bool = False,
    preserve_dtypes: bool = True,
    columns: Sequence[str] | None = None,
    order_categoricals: bool = True,
    chunksize: int | None = None,
    iterator: bool = False,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
) -> DataFrame | StataReader:

    reader = StataReader(
        filepath_or_buffer,
        convert_dates=convert_dates,
        convert_categoricals=convert_categoricals,
        index_col=index_col,
        convert_missing=convert_missing,
        preserve_dtypes=preserve_dtypes,
        columns=columns,
        order_categoricals=order_categoricals,
        chunksize=chunksize,
        storage_options=storage_options,
        compression=compression,
    )

    # When iterating, hand the open reader to the caller; otherwise read
    # everything and close the handle via the context manager.
    if iterator or chunksize:
        return reader

    with reader:
        return reader.read()
StataReader(2004filepath_or_buffer,2005convert_dates=convert_dates,2006convert_categoricals=convert_categoricals,2007index_col=index_col,2008convert_missing=convert_missing,2009preserve_dtypes=preserve_dtypes,2010columns=columns,2011order_categoricals=order_categoricals,2012chunksize=chunksize,2013storage_options=storage_options,2014compression=compression,2015)20162017if iterator or chunksize:2018return reader20192020with reader:2021return reader.read()202220232024def _set_endianness(endianness: str) -> str:2025if endianness.lower() in ["<", "little"]:2026return "<"2027elif endianness.lower() in [">", "big"]:2028return ">"2029else: # pragma : no cover2030raise ValueError(f"Endianness {endianness} not understood")203120322033def _pad_bytes(name: AnyStr, length: int) -> AnyStr:2034"""2035Take a char string and pads it with null bytes until it's length chars.2036"""2037if isinstance(name, bytes):2038return name + b"\x00" * (length - len(name))2039return name + "\x00" * (length - len(name))204020412042def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:2043"""2044Convert from one of the stata date formats to a type in TYPE_MAP.2045"""2046if fmt in [2047"tc",2048"%tc",2049"td",2050"%td",2051"tw",2052"%tw",2053"tm",2054"%tm",2055"tq",2056"%tq",2057"th",2058"%th",2059"ty",2060"%ty",2061]:2062return np.dtype(np.float64) # Stata expects doubles for SIFs2063else:2064raise NotImplementedError(f"Format {fmt} not implemented")206520662067def _maybe_convert_to_int_keys(convert_dates: dict, varlist: list[Hashable]) -> dict:2068new_dict = {}2069for key in convert_dates:2070if not convert_dates[key].startswith("%"): # make sure proper fmts2071convert_dates[key] = "%" + convert_dates[key]2072if key in varlist:2073new_dict.update({varlist.index(key): convert_dates[key]})2074else:2075if not isinstance(key, int):2076raise ValueError("convert_dates key must be a column or an integer")2077new_dict.update({key: convert_dates[key]})2078return new_dict207920802081def 
_dtype_to_stata_type(dtype: np.dtype, column: Series) -> int:2082"""2083Convert dtype types to stata types. Returns the byte of the given ordinal.2084See TYPE_MAP and comments for an explanation. This is also explained in2085the dta spec.20861 - 244 are strings of this length2087Pandas Stata2088251 - for int8 byte2089252 - for int16 int2090253 - for int32 long2091254 - for float32 float2092255 - for double double20932094If there are dates to convert, then dtype will already have the correct2095type inserted.2096"""2097# TODO: expand to handle datetime to integer conversion2098if dtype.type is np.object_: # try to coerce it to the biggest string2099# not memory efficient, what else could we2100# do?2101itemsize = max_len_string_array(ensure_object(column._values))2102return max(itemsize, 1)2103elif dtype.type is np.float64:2104return 2552105elif dtype.type is np.float32:2106return 2542107elif dtype.type is np.int32:2108return 2532109elif dtype.type is np.int16:2110return 2522111elif dtype.type is np.int8:2112return 2512113else: # pragma : no cover2114raise NotImplementedError(f"Data type {dtype} not supported.")211521162117def _dtype_to_default_stata_fmt(2118dtype, column: Series, dta_version: int = 114, force_strl: bool = False2119) -> str:2120"""2121Map numpy dtype to stata's default format for this type. Not terribly2122important since users can change this in Stata. Semantics are21232124object -> "%DDs" where DD is the length of the string. 
If not a string,2125raise ValueError2126float64 -> "%10.0g"2127float32 -> "%9.0g"2128int64 -> "%9.0g"2129int32 -> "%12.0g"2130int16 -> "%8.0g"2131int8 -> "%8.0g"2132strl -> "%9s"2133"""2134# TODO: Refactor to combine type with format2135# TODO: expand this to handle a default datetime format?2136if dta_version < 117:2137max_str_len = 2442138else:2139max_str_len = 20452140if force_strl:2141return "%9s"2142if dtype.type is np.object_:2143itemsize = max_len_string_array(ensure_object(column._values))2144if itemsize > max_str_len:2145if dta_version >= 117:2146return "%9s"2147else:2148raise ValueError(excessive_string_length_error.format(column.name))2149return "%" + str(max(itemsize, 1)) + "s"2150elif dtype == np.float64:2151return "%10.0g"2152elif dtype == np.float32:2153return "%9.0g"2154elif dtype == np.int32:2155return "%12.0g"2156elif dtype == np.int8 or dtype == np.int16:2157return "%8.0g"2158else: # pragma : no cover2159raise NotImplementedError(f"Data type {dtype} not supported.")216021612162@doc(2163storage_options=_shared_docs["storage_options"],2164compression_options=_shared_docs["compression_options"] % "fname",2165)2166class StataWriter(StataParser):2167"""2168A class for writing Stata binary dta files21692170Parameters2171----------2172fname : path (string), buffer or path object2173string, path object (pathlib.Path or py._path.local.LocalPath) or2174object implementing a binary write() functions. If using a buffer2175then the buffer will not be automatically closed after the file2176is written.2177data : DataFrame2178Input to save2179convert_dates : dict2180Dictionary mapping columns containing datetime types to stata internal2181format to use when writing the dates. Options are 'tc', 'td', 'tm',2182'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.2183Datetime columns that do not have a conversion type specified will be2184converted to 'tc'. 
Raises NotImplementedError if a datetime column has2185timezone information2186write_index : bool2187Write the index to Stata dataset.2188byteorder : str2189Can be ">", "<", "little", or "big". default is `sys.byteorder`2190time_stamp : datetime2191A datetime to use as file creation date. Default is the current time2192data_label : str2193A label for the data set. Must be 80 characters or smaller.2194variable_labels : dict2195Dictionary containing columns as keys and variable labels as values.2196Each label must be 80 characters or smaller.2197{compression_options}21982199.. versionadded:: 1.1.022002201.. versionchanged:: 1.4.0 Zstandard support.22022203{storage_options}22042205.. versionadded:: 1.2.022062207value_labels : dict of dicts2208Dictionary containing columns as keys and dictionaries of column value2209to labels as values. The combined length of all labels for a single2210variable must be 32,000 characters or smaller.22112212.. versionadded:: 1.4.022132214Returns2215-------2216writer : StataWriter instance2217The StataWriter instance has a write_file method, which will2218write the file to the given `fname`.22192220Raises2221------2222NotImplementedError2223* If datetimes contain timezone information2224ValueError2225* Columns listed in convert_dates are neither datetime64[ns]2226or datetime.datetime2227* Column dtype is not representable in Stata2228* Column listed in convert_dates is not in DataFrame2229* Categorical label contains more than 32,000 characters22302231Examples2232--------2233>>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b'])2234>>> writer = StataWriter('./data_file.dta', data)2235>>> writer.write_file()22362237Directly write a zip file2238>>> compression = {{"method": "zip", "archive_name": "data_file.dta"}}2239>>> writer = StataWriter('./data_file.zip', data, compression=compression)2240>>> writer.write_file()22412242Save a DataFrame with dates2243>>> from datetime import datetime2244>>> data = pd.DataFrame([[datetime(2000,1,1)]], 
    # Maximum fixed-width string length in dta 114 (later formats override).
    _max_string_length = 244
    # dta 114 text encoding (117+ writers override with utf-8 where needed).
    _encoding = "latin-1"

    def __init__(
        self,
        fname: FilePath | WriteBuffer[bytes],
        data: DataFrame,
        convert_dates: dict[Hashable, str] | None = None,
        write_index: bool = True,
        byteorder: str | None = None,
        time_stamp: datetime.datetime | None = None,
        data_label: str | None = None,
        variable_labels: dict[Hashable, str] | None = None,
        compression: CompressionOptions = "infer",
        storage_options: StorageOptions = None,
        *,
        value_labels: dict[Hashable, dict[float | int, str]] | None = None,
    ):
        super().__init__()
        self.data = data
        self._convert_dates = {} if convert_dates is None else convert_dates
        self._write_index = write_index
        self._time_stamp = time_stamp
        self._data_label = data_label
        self._variable_labels = variable_labels
        # Value labels supplied for non-categorical (plain numeric) columns.
        self._non_cat_value_labels = value_labels
        self._value_labels: list[StataValueLabel] = []
        self._has_value_labels = np.array([], dtype=bool)
        self._compression = compression
        # Buffer target used when writing through a compressed archive.
        self._output_file: IO[bytes] | None = None
        # original col name -> sanitized col name, filled by _check_column_names
        self._converted_names: dict[Hashable, str] = {}
        # attach nobs, nvars, data, varlist, typlist
        self._prepare_pandas(data)
        self.storage_options = storage_options

        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        self._fname = fname
        self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}

    def _write(self, to_write: str) -> None:
        """
        Helper to call encode before writing to file for Python 3 compat.
        """
        self.handles.handle.write(to_write.encode(self._encoding))

    def _write_bytes(self, value: bytes) -> None:
        """
        Helper to assert file is open before writing.
        """
        self.handles.handle.write(value)
    def _prepare_non_cat_value_labels(
        self, data: DataFrame
    ) -> list[StataNonCatValueLabel]:
        """
        Check for value labels provided for non-categorical columns. Value
        labels

        Returns the list of StataNonCatValueLabel objects built from the
        user-supplied ``value_labels`` mapping; raises if a labelled column
        is missing or non-numeric.
        """
        non_cat_value_labels: list[StataNonCatValueLabel] = []
        if self._non_cat_value_labels is None:
            return non_cat_value_labels

        for labname, labels in self._non_cat_value_labels.items():
            # Look the column up under its sanitized name if it was renamed.
            if labname in self._converted_names:
                colname = self._converted_names[labname]
            elif labname in data.columns:
                colname = str(labname)
            else:
                raise KeyError(
                    f"Can't create value labels for {labname}, it wasn't "
                    "found in the dataset."
                )

            if not is_numeric_dtype(data[colname].dtype):
                # Labels should not be passed explicitly for categorical
                # columns that will be converted to int
                raise ValueError(
                    f"Can't create value labels for {labname}, value labels "
                    "can only be applied to numeric columns."
                )
            svl = StataNonCatValueLabel(colname, labels)
            non_cat_value_labels.append(svl)
        return non_cat_value_labels
== np.int16:2365dtype = np.dtype(np.int32)2366else:2367dtype = np.dtype(np.float64)2368values = np.array(values, dtype=dtype)23692370# Replace missing values with Stata missing value for type2371values[values == -1] = get_base_missing_value(dtype)2372data_formatted.append((col, values))2373else:2374data_formatted.append((col, data[col]))2375return DataFrame.from_dict(dict(data_formatted))23762377def _replace_nans(self, data: DataFrame) -> DataFrame:2378# return data2379"""2380Checks floating point data columns for nans, and replaces these with2381the generic Stata for missing value (.)2382"""2383for c in data:2384dtype = data[c].dtype2385if dtype in (np.float32, np.float64):2386if dtype == np.float32:2387replacement = self.MISSING_VALUES["f"]2388else:2389replacement = self.MISSING_VALUES["d"]2390data[c] = data[c].fillna(replacement)23912392return data23932394def _update_strl_names(self) -> None:2395"""No-op, forward compatibility"""2396pass23972398def _validate_variable_name(self, name: str) -> str:2399"""2400Validate variable names for Stata export.24012402Parameters2403----------2404name : str2405Variable name24062407Returns2408-------2409str2410The validated name with invalid characters replaced with2411underscores.24122413Notes2414-----2415Stata 114 and 117 support ascii characters in a-z, A-Z, 0-92416and _.2417"""2418for c in name:2419if (2420(c < "A" or c > "Z")2421and (c < "a" or c > "z")2422and (c < "0" or c > "9")2423and c != "_"2424):2425name = name.replace(c, "_")2426return name24272428def _check_column_names(self, data: DataFrame) -> DataFrame:2429"""2430Checks column names to ensure that they are valid Stata column names.2431This includes checks for:2432* Non-string names2433* Stata keywords2434* Variables that start with numbers2435* Variables with names that are too long24362437When an illegal variable name is detected, it is converted, and if2438dates are exported, the variable name is propagated to the date2439conversion 
    def _check_column_names(self, data: DataFrame) -> DataFrame:
        """
        Checks column names to ensure that they are valid Stata column names.
        This includes checks for:
            * Non-string names
            * Stata keywords
            * Variables that start with numbers
            * Variables with names that are too long

        When an illegal variable name is detected, it is converted, and if
        dates are exported, the variable name is propagated to the date
        conversion dictionary
        """
        converted_names: dict[Hashable, str] = {}
        columns = list(data.columns)
        original_columns = columns[:]

        duplicate_var_id = 0
        for j, name in enumerate(columns):
            orig_name = name
            if not isinstance(name, str):
                name = str(name)

            name = self._validate_variable_name(name)

            # Variable name must not be a reserved word
            if name in self.RESERVED_WORDS:
                name = "_" + name

            # Variable name may not start with a number
            if "0" <= name[0] <= "9":
                name = "_" + name

            # Stata limits variable names to 32 characters
            name = name[: min(len(name), 32)]

            if not name == orig_name:
                # check for duplicates
                while columns.count(name) > 0:
                    # prepend ascending number to avoid duplicates
                    name = "_" + str(duplicate_var_id) + name
                    name = name[: min(len(name), 32)]
                    duplicate_var_id += 1
                converted_names[orig_name] = name

            columns[j] = name

        data.columns = Index(columns)

        # Check date conversion, and fix key if needed
        if self._convert_dates:
            for c, o in zip(columns, original_columns):
                if c != o:
                    self._convert_dates[c] = self._convert_dates[o]
                    del self._convert_dates[o]

        if converted_names:
            conversion_warning = []
            for orig_name, name in converted_names.items():
                msg = f"{orig_name}   ->   {name}"
                conversion_warning.append(msg)

            ws = invalid_name_doc.format("\n    ".join(conversion_warning))
            warnings.warn(ws, InvalidColumnName)

        # Keep the rename map so value labels/date keys can be remapped later.
        self._converted_names = converted_names
        self._update_strl_names()

        return data

    def _set_formats_and_types(self, dtypes: Series) -> None:
        # Build parallel per-column format and Stata type-code lists.
        self.fmtlist: list[str] = []
        self.typlist: list[int] = []
        for col, dtype in dtypes.items():
            self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col]))
            self.typlist.append(_dtype_to_stata_type(dtype, self.data[col]))
    def _prepare_pandas(self, data: DataFrame) -> None:
        # Full preparation pipeline run once at construction time; the order
        # of the steps below is significant.
        # NOTE: we might need a different API / class for pandas objects so
        # we can set different semantics - handle this with a PR to pandas.io

        data = data.copy()

        if self._write_index:
            temp = data.reset_index()
            if isinstance(temp, DataFrame):
                data = temp

        # Ensure column names are strings
        data = self._check_column_names(data)

        # Check columns for compatibility with stata, upcast if necessary
        # Raise if outside the supported range
        data = _cast_to_stata_types(data)

        # Replace NaNs with Stata missing values
        data = self._replace_nans(data)

        # Set all columns to initially unlabelled
        self._has_value_labels = np.repeat(False, data.shape[1])

        # Create value labels for non-categorical data
        non_cat_value_labels = self._prepare_non_cat_value_labels(data)

        non_cat_columns = [svl.labname for svl in non_cat_value_labels]
        has_non_cat_val_labels = data.columns.isin(non_cat_columns)
        self._has_value_labels |= has_non_cat_val_labels
        self._value_labels.extend(non_cat_value_labels)

        # Convert categoricals to int data, and strip labels
        data = self._prepare_categoricals(data)

        self.nobs, self.nvar = data.shape
        self.data = data
        self.varlist = data.columns.tolist()

        dtypes = data.dtypes

        # Ensure all date columns are converted; default conversion is 'tc'.
        for col in data:
            if col in self._convert_dates:
                continue
            if is_datetime64_dtype(data[col]):
                self._convert_dates[col] = "tc"

        self._convert_dates = _maybe_convert_to_int_keys(
            self._convert_dates, self.varlist
        )
        for key in self._convert_dates:
            new_type = _convert_datetime_to_stata_type(self._convert_dates[key])
            dtypes[key] = np.dtype(new_type)

        # Verify object arrays are strings and encode to bytes
        self._encode_strings()

        self._set_formats_and_types(dtypes)

        # set the given format for the datetime cols
        if self._convert_dates is not None:
            for key in self._convert_dates:
                if isinstance(key, int):
                    self.fmtlist[key] = self._convert_dates[key]
    def _encode_strings(self) -> None:
        """
        Encode strings in dta-specific encoding

        Do not encode columns marked for date conversion or for strL
        conversion. The strL converter independently handles conversion and
        also accepts empty string arrays.
        """
        convert_dates = self._convert_dates
        # _convert_strl is not available in dta 114
        convert_strl = getattr(self, "_convert_strl", [])
        for i, col in enumerate(self.data):
            # Skip columns marked for date conversion or strl conversion
            if i in convert_dates or col in convert_strl:
                continue
            column = self.data[col]
            dtype = column.dtype
            if dtype.type is np.object_:
                inferred_dtype = infer_dtype(column, skipna=True)
                # Only all-string (possibly with None) object columns export.
                if not ((inferred_dtype == "string") or len(column) == 0):
                    col = column.name
                    raise ValueError(
                        f"""\
Column `{col}` cannot be exported.\n\nOnly string-like object arrays
containing all strings or a mix of strings and None can be exported.
Object arrays containing only null values are prohibited. Other object
types cannot be exported and must first be converted to one of the
supported types."""
                    )
                encoded = self.data[col].str.encode(self._encoding)
                # If larger than _max_string_length do nothing
                if (
                    max_len_string_array(ensure_object(encoded._values))
                    <= self._max_string_length
                ):
                    self.data[col] = encoded
    def write_file(self) -> None:
        """
        Export DataFrame object to Stata dta format.

        On failure, a partially written file on disk is deleted (best
        effort); a ResourceWarning is raised if deletion fails.
        """
        with get_handle(
            self._fname,
            "wb",
            compression=self._compression,
            is_text=False,
            storage_options=self.storage_options,
        ) as self.handles:

            if self.handles.compression["method"] is not None:
                # ZipFile creates a file (with the same name) for each write call.
                # Write it first into a buffer and then write the buffer to the ZipFile.
                self._output_file, self.handles.handle = self.handles.handle, BytesIO()
                self.handles.created_handles.append(self.handles.handle)

            try:
                # Section order follows the dta 114 on-disk layout.
                self._write_header(
                    data_label=self._data_label, time_stamp=self._time_stamp
                )
                self._write_map()
                self._write_variable_types()
                self._write_varnames()
                self._write_sortlist()
                self._write_formats()
                self._write_value_label_names()
                self._write_variable_labels()
                self._write_expansion_fields()
                self._write_characteristics()
                records = self._prepare_data()
                self._write_data(records)
                self._write_strls()
                self._write_value_labels()
                self._write_file_close_tag()
                # Second _write_map call back-fills real section offsets (117+).
                self._write_map()
                self._close()
            except Exception as exc:
                self.handles.close()
                if isinstance(self._fname, (str, os.PathLike)) and os.path.isfile(
                    self._fname
                ):
                    try:
                        os.unlink(self._fname)
                    except OSError:
                        warnings.warn(
                            f"This save was not successful but {self._fname} could not "
                            "be deleted. This file is not valid.",
                            ResourceWarning,
                        )
                raise exc

    def _close(self) -> None:
        """
        Close the file if it was created by the writer.

        If a buffer or file-like object was passed in, for example a GzipFile,
        then leave this file open for the caller to close.
        """
        # write compression
        if self._output_file is not None:
            assert isinstance(self.handles.handle, BytesIO)
            bio, self.handles.handle = self.handles.handle, self._output_file
            self.handles.handle.write(bio.getvalue())

    def _write_map(self) -> None:
        """No-op, future compatibility"""
        pass

    def _write_file_close_tag(self) -> None:
        """No-op, future compatibility"""
        pass

    def _write_characteristics(self) -> None:
        """No-op, future compatibility"""
        pass

    def _write_strls(self) -> None:
        """No-op, future compatibility"""
        pass

    def _write_expansion_fields(self) -> None:
        """Write 5 zeros for expansion fields"""
        self._write(_pad_bytes("", 5))

    def _write_value_labels(self) -> None:
        # Emit one value-label block per labelled (categorical or explicit)
        # column, in registration order.
        for vl in self._value_labels:
            self._write_bytes(vl.generate_value_label(self._byteorder))
    def _write_header(
        self,
        data_label: str | None = None,
        time_stamp: datetime.datetime | None = None,
    ) -> None:
        # Write the fixed-layout dta 114 file header.
        byteorder = self._byteorder
        # ds_format - just use 114
        self._write_bytes(struct.pack("b", 114))
        # byteorder
        self._write(byteorder == ">" and "\x01" or "\x02")
        # filetype
        self._write("\x01")
        # unused
        self._write("\x00")
        # number of vars, 2 bytes
        self._write_bytes(struct.pack(byteorder + "h", self.nvar)[:2])
        # number of obs, 4 bytes
        self._write_bytes(struct.pack(byteorder + "i", self.nobs)[:4])
        # data label 81 bytes, char, null terminated
        if data_label is None:
            self._write_bytes(self._null_terminate_bytes(_pad_bytes("", 80)))
        else:
            self._write_bytes(
                self._null_terminate_bytes(_pad_bytes(data_label[:80], 80))
            )
        # time stamp, 18 bytes, char, null terminated
        # format dd Mon yyyy hh:mm
        if time_stamp is None:
            time_stamp = datetime.datetime.now()
        elif not isinstance(time_stamp, datetime.datetime):
            raise ValueError("time_stamp should be datetime type")
        # GH #13856
        # Avoid locale-specific month conversion
        months = [
            "Jan",
            "Feb",
            "Mar",
            "Apr",
            "May",
            "Jun",
            "Jul",
            "Aug",
            "Sep",
            "Oct",
            "Nov",
            "Dec",
        ]
        month_lookup = {i + 1: month for i, month in enumerate(months)}
        ts = (
            time_stamp.strftime("%d ")
            + month_lookup[time_stamp.month]
            + time_stamp.strftime(" %Y %H:%M")
        )
        self._write_bytes(self._null_terminate_bytes(ts))

    def _write_variable_types(self) -> None:
        # One unsigned byte per column: the Stata type ordinal.
        for typ in self.typlist:
            self._write_bytes(struct.pack("B", typ))

    def _write_varnames(self) -> None:
        # varlist names are checked by _check_column_names
        # varlist, requires null terminated
        for name in self.varlist:
            name = self._null_terminate_str(name)
            name = _pad_bytes(name[:32], 33)
            self._write(name)

    def _write_sortlist(self) -> None:
        # srtlist, 2*(nvar+1), int array, encoded by byteorder
        # All zeros: the writer never declares a sort order.
        srtlist = _pad_bytes("", 2 * (self.nvar + 1))
        self._write(srtlist)
    def _write_formats(self) -> None:
        # fmtlist, 49*nvar, char array
        for fmt in self.fmtlist:
            self._write(_pad_bytes(fmt, 49))

    def _write_value_label_names(self) -> None:
        # lbllist, 33*nvar, char array
        for i in range(self.nvar):
            # Use variable name when categorical
            if self._has_value_labels[i]:
                name = self.varlist[i]
                name = self._null_terminate_str(name)
                name = _pad_bytes(name[:32], 33)
                self._write(name)
            else:  # Default is empty label
                self._write(_pad_bytes("", 33))

    def _write_variable_labels(self) -> None:
        # Missing labels are 80 blank characters plus null termination
        blank = _pad_bytes("", 81)

        if self._variable_labels is None:
            for i in range(self.nvar):
                self._write(blank)
            return

        for col in self.data:
            if col in self._variable_labels:
                label = self._variable_labels[col]
                if len(label) > 80:
                    raise ValueError("Variable labels must be 80 characters or fewer")
                # dta 114 labels are latin-1; reject anything outside it.
                is_latin1 = all(ord(c) < 256 for c in label)
                if not is_latin1:
                    raise ValueError(
                        "Variable labels must contain only characters that "
                        "can be encoded in Latin-1"
                    )
                self._write(_pad_bytes(label, 81))
            else:
                self._write(blank)

    def _convert_strls(self, data: DataFrame) -> DataFrame:
        """No-op, future compatibility"""
        return data
    def _prepare_data(self) -> np.recarray:
        # Final conversion of the prepared DataFrame into a structured
        # record array matching the on-disk row layout.
        data = self.data
        typlist = self.typlist
        convert_dates = self._convert_dates
        # 1. Convert dates
        if self._convert_dates is not None:
            for i, col in enumerate(data):
                if i in convert_dates:
                    data[col] = _datetime_to_stata_elapsed_vec(
                        data[col], self.fmtlist[i]
                    )
        # 2. Convert strls
        data = self._convert_strls(data)

        # 3. Convert bad string data to '' and pad to correct length
        dtypes = {}
        native_byteorder = self._byteorder == _set_endianness(sys.byteorder)
        for i, col in enumerate(data):
            typ = typlist[i]
            if typ <= self._max_string_length:
                # String column: type ordinal is the fixed byte width.
                data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,))
                stype = f"S{typ}"
                dtypes[col] = stype
                data[col] = data[col].astype(stype)
            else:
                dtype = data[col].dtype
                if not native_byteorder:
                    # Numeric data must be byte-swapped to the target order.
                    dtype = dtype.newbyteorder(self._byteorder)
                dtypes[col] = dtype

        return data.to_records(index=False, column_dtypes=dtypes)

    def _write_data(self, records: np.recarray) -> None:
        # Rows are already laid out correctly; dump raw bytes.
        self._write_bytes(records.tobytes())

    @staticmethod
    def _null_terminate_str(s: str) -> str:
        # Append the C-style terminator expected by the dta format.
        s += "\x00"
        return s

    def _null_terminate_bytes(self, s: str) -> bytes:
        return self._null_terminate_str(s).encode(self._encoding)
This is also explained in2862the dta spec.28631 - 2045 are strings of this length2864Pandas Stata286532768 - for object strL286665526 - for int8 byte286765527 - for int16 int286865528 - for int32 long286965529 - for float32 float287065530 - for double double28712872If there are dates to convert, then dtype will already have the correct2873type inserted.2874"""2875# TODO: expand to handle datetime to integer conversion2876if force_strl:2877return 327682878if dtype.type is np.object_: # try to coerce it to the biggest string2879# not memory efficient, what else could we2880# do?2881itemsize = max_len_string_array(ensure_object(column._values))2882itemsize = max(itemsize, 1)2883if itemsize <= 2045:2884return itemsize2885return 327682886elif dtype.type is np.float64:2887return 655262888elif dtype.type is np.float32:2889return 655272890elif dtype.type is np.int32:2891return 655282892elif dtype.type is np.int16:2893return 655292894elif dtype.type is np.int8:2895return 655302896else: # pragma : no cover2897raise NotImplementedError(f"Data type {dtype} not supported.")289828992900def _pad_bytes_new(name: str | bytes, length: int) -> bytes:2901"""2902Takes a bytes instance and pads it with null bytes until it's length chars.2903"""2904if isinstance(name, str):2905name = bytes(name, "utf-8")2906return name + b"\x00" * (length - len(name))290729082909class StataStrLWriter:2910"""2911Converter for Stata StrLs29122913Stata StrLs map 8 byte values to strings which are stored using a2914dictionary-like format where strings are keyed to two values.29152916Parameters2917----------2918df : DataFrame2919DataFrame to convert2920columns : Sequence[str]2921List of columns names to convert to StrL2922version : int, optional2923dta version. Currently supports 117, 118 and 1192924byteorder : str, optional2925Can be ">", "<", "little", or "big". default is `sys.byteorder`29262927Notes2928-----2929Supports creation of the StrL block of a dta file for dta versions2930117, 118 and 119. 
    def __init__(
        self,
        df: DataFrame,
        columns: Sequence[str],
        version: int = 117,
        byteorder: str | None = None,
    ):
        if version not in (117, 118, 119):
            raise ValueError("Only dta versions 117, 118 and 119 supported")
        self._dta_ver = version

        self.df = df
        self.columns = columns
        # Sentinel: empty string always maps to (v, o) == (0, 0).
        self._gso_table = {"": (0, 0)}
        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)

        gso_v_type = "I"  # uint32
        gso_o_type = "Q"  # uint64
        self._encoding = "utf-8"
        if version == 117:
            o_size = 4
            gso_o_type = "I"  # 117 used uint32
            self._encoding = "latin-1"
        elif version == 118:
            o_size = 6
        else:  # version == 119
            o_size = 5
        # Shift used to pack (v, o) into one integer: enc = v + o * offset.
        # NOTE: attribute name keeps its historical typo ("offet").
        self._o_offet = 2 ** (8 * (8 - o_size))
        self._gso_o_type = gso_o_type
        self._gso_v_type = gso_v_type

    def _convert_key(self, key: tuple[int, int]) -> int:
        # Pack a (v, o) lookup pair into the single uint64 stored in the data.
        v, o = key
        return v + self._o_offet * o
    def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]:
        """
        Generates the GSO lookup table for the DataFrame

        Returns
        -------
        gso_table : dict
            Ordered dictionary using the string found as keys
            and their lookup position (v,o) as values
        gso_df : DataFrame
            DataFrame where strl columns have been converted to
            (v,o) values

        Notes
        -----
        Modifies the DataFrame in-place.

        The DataFrame returned encodes the (v,o) values as uint64s. The
        encoding depends on the dta version, and can be expressed as

        enc = v + o * 2 ** (o_size * 8)

        so that v is stored in the lower bits and o is in the upper
        bits. o_size is

          * 117: 4
          * 118: 6
          * 119: 5
        """
        gso_table = self._gso_table
        gso_df = self.df
        columns = list(gso_df.columns)
        selected = gso_df[self.columns]
        col_index = [(col, columns.index(col)) for col in self.columns]
        keys = np.empty(selected.shape, dtype=np.uint64)
        for o, (idx, row) in enumerate(selected.iterrows()):
            for j, (col, v) in enumerate(col_index):
                val = row[col]
                # Allow columns with mixed str and None (GH 23633)
                val = "" if val is None else val
                key = gso_table.get(val, None)
                if key is None:
                    # Stata prefers human numbers
                    key = (v + 1, o + 1)
                    gso_table[val] = key
                keys[o, j] = self._convert_key(key)
        # Replace the string columns with their packed (v, o) keys.
        for i, col in enumerate(self.columns):
            gso_df[col] = keys[:, i]

        return gso_table, gso_df
    def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes:
        """
        Generates the binary blob of GSOs that is written to the dta file.

        Parameters
        ----------
        gso_table : dict
            Ordered dictionary (str, vo)

        Returns
        -------
        gso : bytes
            Binary content of dta file to be placed between strl tags

        Notes
        -----
        Output format depends on dta version.  117 uses two uint32s to
        express v and o while 118+ uses a uint32 for v and a uint64 for o.
        """
        # Format information
        # Length includes null term
        # 117
        # GSOvvvvooootllllxxxxxxxxxxxxxxx...x
        #  3  u4  u4 u1 u4  string + null term
        #
        # 118, 119
        # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x
        #  3  u4   u8   u1 u4  string + null term

        bio = BytesIO()
        gso = bytes("GSO", "ascii")
        # Type tag 130: binary (null-terminated) string payload.
        gso_type = struct.pack(self._byteorder + "B", 130)
        null = struct.pack(self._byteorder + "B", 0)
        v_type = self._byteorder + self._gso_v_type
        o_type = self._byteorder + self._gso_o_type
        len_type = self._byteorder + "I"
        for strl, vo in gso_table.items():
            # Skip the empty-string sentinel entry.
            if vo == (0, 0):
                continue
            v, o = vo

            # GSO
            bio.write(gso)

            # vvvv
            bio.write(struct.pack(v_type, v))

            # oooo / oooooooo
            bio.write(struct.pack(o_type, o))

            # t
            bio.write(gso_type)

            # llll
            utf8_string = bytes(strl, "utf-8")
            bio.write(struct.pack(len_type, len(utf8_string) + 1))

            # xxx...xxx
            bio.write(utf8_string)
            bio.write(null)

        return bio.getvalue()
Raises NotImplementedError if a datetime column has3109timezone information3110write_index : bool3111Write the index to Stata dataset.3112byteorder : str3113Can be ">", "<", "little", or "big". default is `sys.byteorder`3114time_stamp : datetime3115A datetime to use as file creation date. Default is the current time3116data_label : str3117A label for the data set. Must be 80 characters or smaller.3118variable_labels : dict3119Dictionary containing columns as keys and variable labels as values.3120Each label must be 80 characters or smaller.3121convert_strl : list3122List of columns names to convert to Stata StrL format. Columns with3123more than 2045 characters are automatically written as StrL.3124Smaller columns can be converted by including the column name. Using3125StrLs can reduce output file size when strings are longer than 83126characters, and either frequently repeated or sparse.3127{compression_options}31283129.. versionadded:: 1.1.031303131.. versionchanged:: 1.4.0 Zstandard support.31323133value_labels : dict of dicts3134Dictionary containing columns as keys and dictionaries of column value3135to labels as values. The combined length of all labels for a single3136variable must be 32,000 characters or smaller.31373138.. 
versionadded:: 1.4.031393140Returns3141-------3142writer : StataWriter117 instance3143The StataWriter117 instance has a write_file method, which will3144write the file to the given `fname`.31453146Raises3147------3148NotImplementedError3149* If datetimes contain timezone information3150ValueError3151* Columns listed in convert_dates are neither datetime64[ns]3152or datetime.datetime3153* Column dtype is not representable in Stata3154* Column listed in convert_dates is not in DataFrame3155* Categorical label contains more than 32,000 characters31563157Examples3158--------3159>>> from pandas.io.stata import StataWriter1173160>>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c'])3161>>> writer = StataWriter117('./data_file.dta', data)3162>>> writer.write_file()31633164Directly write a zip file3165>>> compression = {"method": "zip", "archive_name": "data_file.dta"}3166>>> writer = StataWriter117('./data_file.zip', data, compression=compression)3167>>> writer.write_file()31683169Or with long strings stored in strl format3170>>> data = pd.DataFrame([['A relatively long string'], [''], ['']],3171... columns=['strls'])3172>>> writer = StataWriter117('./data_file_with_long_strings.dta', data,3173... 
    # dta 117 allows fixed-width strings up to 2045 characters.
    _max_string_length = 2045
    _dta_version = 117

    def __init__(
        self,
        fname: FilePath | WriteBuffer[bytes],
        data: DataFrame,
        convert_dates: dict[Hashable, str] | None = None,
        write_index: bool = True,
        byteorder: str | None = None,
        time_stamp: datetime.datetime | None = None,
        data_label: str | None = None,
        variable_labels: dict[Hashable, str] | None = None,
        convert_strl: Sequence[Hashable] | None = None,
        compression: CompressionOptions = "infer",
        storage_options: StorageOptions = None,
        *,
        value_labels: dict[Hashable, dict[float | int, str]] | None = None,
    ):
        # Copy to new list since convert_strl might be modified later
        self._convert_strl: list[Hashable] = []
        if convert_strl is not None:
            self._convert_strl.extend(convert_strl)

        super().__init__(
            fname,
            data,
            convert_dates,
            write_index,
            byteorder=byteorder,
            time_stamp=time_stamp,
            data_label=data_label,
            variable_labels=variable_labels,
            value_labels=value_labels,
            compression=compression,
            storage_options=storage_options,
        )
        # Section name -> file offset; populated/back-filled by _write_map.
        self._map: dict[str, int] = {}
        # Pre-rendered strL section, built while preparing the data.
        self._strl_blob = b""

    @staticmethod
    def _tag(val: str | bytes, tag: str) -> bytes:
        """Surround val with <tag></tag>"""
        if isinstance(val, str):
            val = bytes(val, "utf-8")
        return bytes("<" + tag + ">", "utf-8") + val + bytes("</" + tag + ">", "utf-8")

    def _update_map(self, tag: str) -> None:
        """Update map location for tag with file position"""
        assert self.handles.handle is not None
        self._map[tag] = self.handles.handle.tell()
"utf-8"), "release"))3240# byteorder3241bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder"))3242# number of vars, 2 bytes in 117 and 118, 4 byte in 1193243nvar_type = "H" if self._dta_version <= 118 else "I"3244bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K"))3245# 117 uses 4 bytes, 118 uses 83246nobs_size = "I" if self._dta_version == 117 else "Q"3247bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N"))3248# data label 81 bytes, char, null terminated3249label = data_label[:80] if data_label is not None else ""3250encoded_label = label.encode(self._encoding)3251label_size = "B" if self._dta_version == 117 else "H"3252label_len = struct.pack(byteorder + label_size, len(encoded_label))3253encoded_label = label_len + encoded_label3254bio.write(self._tag(encoded_label, "label"))3255# time stamp, 18 bytes, char, null terminated3256# format dd Mon yyyy hh:mm3257if time_stamp is None:3258time_stamp = datetime.datetime.now()3259elif not isinstance(time_stamp, datetime.datetime):3260raise ValueError("time_stamp should be datetime type")3261# Avoid locale-specific month conversion3262months = [3263"Jan",3264"Feb",3265"Mar",3266"Apr",3267"May",3268"Jun",3269"Jul",3270"Aug",3271"Sep",3272"Oct",3273"Nov",3274"Dec",3275]3276month_lookup = {i + 1: month for i, month in enumerate(months)}3277ts = (3278time_stamp.strftime("%d ")3279+ month_lookup[time_stamp.month]3280+ time_stamp.strftime(" %Y %H:%M")3281)3282# '\x11' added due to inspection of Stata file3283stata_ts = b"\x11" + bytes(ts, "utf-8")3284bio.write(self._tag(stata_ts, "timestamp"))3285self._write_bytes(self._tag(bio.getvalue(), "header"))32863287def _write_map(self) -> None:3288"""3289Called twice during file write. The first populates the values in3290the map with 0s. 
The second call writes the final map locations when3291all blocks have been written.3292"""3293if not self._map:3294self._map = {3295"stata_data": 0,3296"map": self.handles.handle.tell(),3297"variable_types": 0,3298"varnames": 0,3299"sortlist": 0,3300"formats": 0,3301"value_label_names": 0,3302"variable_labels": 0,3303"characteristics": 0,3304"data": 0,3305"strls": 0,3306"value_labels": 0,3307"stata_data_close": 0,3308"end-of-file": 0,3309}3310# Move to start of map3311self.handles.handle.seek(self._map["map"])3312bio = BytesIO()3313for val in self._map.values():3314bio.write(struct.pack(self._byteorder + "Q", val))3315self._write_bytes(self._tag(bio.getvalue(), "map"))33163317def _write_variable_types(self) -> None:3318self._update_map("variable_types")3319bio = BytesIO()3320for typ in self.typlist:3321bio.write(struct.pack(self._byteorder + "H", typ))3322self._write_bytes(self._tag(bio.getvalue(), "variable_types"))33233324def _write_varnames(self) -> None:3325self._update_map("varnames")3326bio = BytesIO()3327# 118 scales by 4 to accommodate utf-8 data worst case encoding3328vn_len = 32 if self._dta_version == 117 else 1283329for name in self.varlist:3330name = self._null_terminate_str(name)3331name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1)3332bio.write(name)3333self._write_bytes(self._tag(bio.getvalue(), "varnames"))33343335def _write_sortlist(self) -> None:3336self._update_map("sortlist")3337sort_size = 2 if self._dta_version < 119 else 43338self._write_bytes(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist"))33393340def _write_formats(self) -> None:3341self._update_map("formats")3342bio = BytesIO()3343fmt_len = 49 if self._dta_version == 117 else 573344for fmt in self.fmtlist:3345bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len))3346self._write_bytes(self._tag(bio.getvalue(), "formats"))33473348def _write_value_label_names(self) -> None:3349self._update_map("value_label_names")3350bio = BytesIO()3351# 118 scales by 4 
to accommodate utf-8 data worst case encoding3352vl_len = 32 if self._dta_version == 117 else 1283353for i in range(self.nvar):3354# Use variable name when categorical3355name = "" # default name3356if self._has_value_labels[i]:3357name = self.varlist[i]3358name = self._null_terminate_str(name)3359encoded_name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1)3360bio.write(encoded_name)3361self._write_bytes(self._tag(bio.getvalue(), "value_label_names"))33623363def _write_variable_labels(self) -> None:3364# Missing labels are 80 blank characters plus null termination3365self._update_map("variable_labels")3366bio = BytesIO()3367# 118 scales by 4 to accommodate utf-8 data worst case encoding3368vl_len = 80 if self._dta_version == 117 else 3203369blank = _pad_bytes_new("", vl_len + 1)33703371if self._variable_labels is None:3372for _ in range(self.nvar):3373bio.write(blank)3374self._write_bytes(self._tag(bio.getvalue(), "variable_labels"))3375return33763377for col in self.data:3378if col in self._variable_labels:3379label = self._variable_labels[col]3380if len(label) > 80:3381raise ValueError("Variable labels must be 80 characters or fewer")3382try:3383encoded = label.encode(self._encoding)3384except UnicodeEncodeError as err:3385raise ValueError(3386"Variable labels must contain only characters that "3387f"can be encoded in {self._encoding}"3388) from err33893390bio.write(_pad_bytes_new(encoded, vl_len + 1))3391else:3392bio.write(blank)3393self._write_bytes(self._tag(bio.getvalue(), "variable_labels"))33943395def _write_characteristics(self) -> None:3396self._update_map("characteristics")3397self._write_bytes(self._tag(b"", "characteristics"))33983399def _write_data(self, records) -> None:3400self._update_map("data")3401self._write_bytes(b"<data>")3402self._write_bytes(records.tobytes())3403self._write_bytes(b"</data>")34043405def _write_strls(self) -> None:3406self._update_map("strls")3407self._write_bytes(self._tag(self._strl_blob, "strls"))34083409def 
_write_expansion_fields(self) -> None:3410"""No-op in dta 117+"""3411pass34123413def _write_value_labels(self) -> None:3414self._update_map("value_labels")3415bio = BytesIO()3416for vl in self._value_labels:3417lab = vl.generate_value_label(self._byteorder)3418lab = self._tag(lab, "lbl")3419bio.write(lab)3420self._write_bytes(self._tag(bio.getvalue(), "value_labels"))34213422def _write_file_close_tag(self) -> None:3423self._update_map("stata_data_close")3424self._write_bytes(bytes("</stata_dta>", "utf-8"))3425self._update_map("end-of-file")34263427def _update_strl_names(self) -> None:3428"""3429Update column names for conversion to strl if they might have been3430changed to comply with Stata naming rules3431"""3432# Update convert_strl if names changed3433for orig, new in self._converted_names.items():3434if orig in self._convert_strl:3435idx = self._convert_strl.index(orig)3436self._convert_strl[idx] = new34373438def _convert_strls(self, data: DataFrame) -> DataFrame:3439"""3440Convert columns to StrLs if either very large or in the3441convert_strl variable3442"""3443convert_cols = [3444col3445for i, col in enumerate(data)3446if self.typlist[i] == 32768 or col in self._convert_strl3447]34483449if convert_cols:3450ssw = StataStrLWriter(data, convert_cols, version=self._dta_version)3451tab, new_data = ssw.generate_table()3452data = new_data3453self._strl_blob = ssw.generate_blob(tab)3454return data34553456def _set_formats_and_types(self, dtypes: Series) -> None:3457self.typlist = []3458self.fmtlist = []3459for col, dtype in dtypes.items():3460force_strl = col in self._convert_strl3461fmt = _dtype_to_default_stata_fmt(3462dtype,3463self.data[col],3464dta_version=self._dta_version,3465force_strl=force_strl,3466)3467self.fmtlist.append(fmt)3468self.typlist.append(3469_dtype_to_stata_type_117(dtype, self.data[col], force_strl)3470)347134723473class StataWriterUTF8(StataWriter117):3474"""3475Stata binary dta file writing in Stata 15 (118) and 16 (119) formats34763477DTA 
118 and 119 format files support unicode string data (both fixed3478and strL) format. Unicode is also supported in value labels, variable3479labels and the dataset label. Format 119 is automatically used if the3480file contains more than 32,767 variables.34813482.. versionadded:: 1.0.034833484Parameters3485----------3486fname : path (string), buffer or path object3487string, path object (pathlib.Path or py._path.local.LocalPath) or3488object implementing a binary write() functions. If using a buffer3489then the buffer will not be automatically closed after the file3490is written.3491data : DataFrame3492Input to save3493convert_dates : dict, default None3494Dictionary mapping columns containing datetime types to stata internal3495format to use when writing the dates. Options are 'tc', 'td', 'tm',3496'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.3497Datetime columns that do not have a conversion type specified will be3498converted to 'tc'. Raises NotImplementedError if a datetime column has3499timezone information3500write_index : bool, default True3501Write the index to Stata dataset.3502byteorder : str, default None3503Can be ">", "<", "little", or "big". default is `sys.byteorder`3504time_stamp : datetime, default None3505A datetime to use as file creation date. Default is the current time3506data_label : str, default None3507A label for the data set. Must be 80 characters or smaller.3508variable_labels : dict, default None3509Dictionary containing columns as keys and variable labels as values.3510Each label must be 80 characters or smaller.3511convert_strl : list, default None3512List of columns names to convert to Stata StrL format. Columns with3513more than 2045 characters are automatically written as StrL.3514Smaller columns can be converted by including the column name. 
Using3515StrLs can reduce output file size when strings are longer than 83516characters, and either frequently repeated or sparse.3517version : int, default None3518The dta version to use. By default, uses the size of data to determine3519the version. 118 is used if data.shape[1] <= 32767, and 119 is used3520for storing larger DataFrames.3521{compression_options}35223523.. versionadded:: 1.1.035243525.. versionchanged:: 1.4.0 Zstandard support.35263527value_labels : dict of dicts3528Dictionary containing columns as keys and dictionaries of column value3529to labels as values. The combined length of all labels for a single3530variable must be 32,000 characters or smaller.35313532.. versionadded:: 1.4.035333534Returns3535-------3536StataWriterUTF83537The instance has a write_file method, which will write the file to the3538given `fname`.35393540Raises3541------3542NotImplementedError3543* If datetimes contain timezone information3544ValueError3545* Columns listed in convert_dates are neither datetime64[ns]3546or datetime.datetime3547* Column dtype is not representable in Stata3548* Column listed in convert_dates is not in DataFrame3549* Categorical label contains more than 32,000 characters35503551Examples3552--------3553Using Unicode data and column names35543555>>> from pandas.io.stata import StataWriterUTF83556>>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ'])3557>>> writer = StataWriterUTF8('./data_file.dta', data)3558>>> writer.write_file()35593560Directly write a zip file3561>>> compression = {"method": "zip", "archive_name": "data_file.dta"}3562>>> writer = StataWriterUTF8('./data_file.zip', data, compression=compression)3563>>> writer.write_file()35643565Or with long strings stored in strl format35663567>>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']],3568... columns=['strls'])3569>>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data,3570... 
convert_strl=['strls'])3571>>> writer.write_file()3572"""35733574_encoding = "utf-8"35753576def __init__(3577self,3578fname: FilePath | WriteBuffer[bytes],3579data: DataFrame,3580convert_dates: dict[Hashable, str] | None = None,3581write_index: bool = True,3582byteorder: str | None = None,3583time_stamp: datetime.datetime | None = None,3584data_label: str | None = None,3585variable_labels: dict[Hashable, str] | None = None,3586convert_strl: Sequence[Hashable] | None = None,3587version: int | None = None,3588compression: CompressionOptions = "infer",3589storage_options: StorageOptions = None,3590*,3591value_labels: dict[Hashable, dict[float | int, str]] | None = None,3592):3593if version is None:3594version = 118 if data.shape[1] <= 32767 else 1193595elif version not in (118, 119):3596raise ValueError("version must be either 118 or 119.")3597elif version == 118 and data.shape[1] > 32767:3598raise ValueError(3599"You must use version 119 for data sets containing more than"3600"32,767 variables"3601)36023603super().__init__(3604fname,3605data,3606convert_dates=convert_dates,3607write_index=write_index,3608byteorder=byteorder,3609time_stamp=time_stamp,3610data_label=data_label,3611variable_labels=variable_labels,3612value_labels=value_labels,3613convert_strl=convert_strl,3614compression=compression,3615storage_options=storage_options,3616)3617# Override version set in StataWriter117 init3618self._dta_version = version36193620def _validate_variable_name(self, name: str) -> str:3621"""3622Validate variable names for Stata export.36233624Parameters3625----------3626name : str3627Variable name36283629Returns3630-------3631str3632The validated name with invalid characters replaced with3633underscores.36343635Notes3636-----3637Stata 118+ support most unicode characters. 
The only limitation is in3638the ascii range where the characters supported are a-z, A-Z, 0-9 and _.3639"""3640# High code points appear to be acceptable3641for c in name:3642if (3643ord(c) < 1283644and (c < "A" or c > "Z")3645and (c < "a" or c > "z")3646and (c < "0" or c > "9")3647and c != "_"3648) or 128 <= ord(c) < 256:3649name = name.replace(c, "_")36503651return name365236533654