diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/404.html b/404.html
new file mode 100644
index 00000000..4f6ada08

- Fix `kwargs` not updated when the pipeline is a `Pipen` object in `utils.load_pipeline()`
- Add `pipen.run()` as a function to run a pipeline
- Add `proc` for job plugin APIs
- Remove the `abstractproperty` decorator from the `CLIPlugin` class
- `dirsig`
- `pipen.plugin_context` due to the simplug upgrade
- Rename `args` to `argv` for `utils.is_loading_pipeline()`
- Allow arguments (e.g. `--help`) to be passed to `utils.is_loading_pipeline` to check arguments in `sys.argv`
- Bump `varname` to 0.13
- Add `envs_depth` to procs to control the depth of envs to be inherited by subclasses
- Rename `on_job_running` to `on_job_started` and add `on_job_polling`
- Use `sys.argv[1:]` by default when `cli_args` is `None` in `utils.load_pipeline()`
- Add `on_proc_script_computed` hook
- Rename `on_proc_init` to `on_proc_create`
- Add the `on_proc_init` hook back, but run it after the process is initialized instead of before
- Fix `utils.mark` and `get_marked` when `__meta__` is `None`
- `utils.mark` and `get_marked` now work with `ProcGroup` and other classes
- Rename `_results` to `-output`
- Add `--list` for `pipen profile` to list the names of available profiles
- Add `on_job_cached` hook
- Add `utils.mark` and `get_marked` to mark a process (see the sketch below); unlike `plugin_opts`, `template_opts` or `envs`, these marks are not inherited in subclasses
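A rough sketch of how `utils.mark` and `get_marked` can be used; the decorator-style call and the exact signatures are assumptions here, so check the API docs:

```python
from pipen import Proc
from pipen.utils import get_marked, mark


@mark(deprecated=True)  # attach an arbitrary mark to this process class
class P1(Proc):
    """A made-up process, used only to illustrate marking."""
    input = "invar"


class P2(P1):
    """Subclass of P1; marks are not inherited."""


print(get_marked(P1, "deprecated"))  # True
print(get_marked(P2, "deprecated"))  # not inherited, falls back to the default
```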
- Add `Proc.__meta__`, which will not be inherited when subclassing
- Add `procgroup` in `Proc.__meta__`
- `Proc.__doc__` when subclassing
- Add `~/.pipen.toml` in defaults
- ✨ Add `ProcGroup` to manage groups of processes:
from pipen import Proc, ProcGroup


class MyGroup(ProcGroup):

    @ProcGroup.add_proc
    def my_proc(self):
        class MyProc(Proc):
            ...
        return MyProc

    @ProcGroup.add_proc
    def my_proc2(self):
        class MyProc2(Proc):
            requires = self.my_proc
            ...
        return MyProc2


pg = MyGroup()
# Run as a pipeline
pg.as_pipen().set_data(...).run()

# Integrate into a pipeline
<proc_of_a_pipeline>.requires = pg.my_proc2
- Bump `argx` to 0.2.2
- Add `parse_args()` to cli plugins
- Bump `argx` to 0.2
- Fix `nexts` being inherited for `Proc` subclasses
- Use `rtoml` instead of `toml` (see https://github.com/pwwang/toml-bench)
- Fix `FutureWarning` in `Proc._compute_input()`
- Add `__doc__` for `Proc.from_proc()`
- `job.signature.toml` to force cache a job
- Inherit `envs`, `plugin_opts` and `scheduler_opts` while subclassing processes (`Proc.__init_subclass__()`)
- Add `on_proc_input_computed` hook
- Allow `requires` to be set by `__setattr__()`
- Make `pipen._build_proc_relationships()` public and don't rebuild the relations
- Allow `on_proc_init` to modify the process workdir
- Fix `nexts` affected by parent `nexts` assignment when parent is in `__bases__`
- Add `on_proc_init()` hook to enable plugins to modify the default attributes of processes
- Rename `Proc.args` to `Proc.envs`
- Add `set_starts()` and `set_data()` to set start processes of a pipeline (see the sketch after this list)
- Rename the `exec_cmd` hook of cli plugins to `exec_command`
- It's now fully documented. See the documentation.
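A minimal sketch of the pipeline-running APIs mentioned in the entries above (`set_starts()`, `set_data()` and `pipen.run()`); the process, its input data and the pipeline name are made up for illustration:

```python
from pipen import Pipen, Proc


class MyProc(Proc):
    """A made-up start process: writes each input value to a file."""
    input = "invar"
    output = "outfile:file:{{in.invar}}.txt"
    script = "echo {{in.invar}} > {{out.outfile}}"


# set_starts() declares the start process(es); set_data() feeds their input
pipeline = Pipen(name="example_pipeline")
pipeline.set_starts(MyProc).set_data([1, 2, 3])

if __name__ == "__main__":
    pipeline.run()
    # pipen.run() (added above) wraps the same steps in a single call;
    # see the API docs for its exact signature.
```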
Provide some functions for creating and modifying channels (dataframes).

collapse_files(data, col)
(DataFrame)
— Collapse a Channel according to the files in col

expand_dir(data, col, pattern, ftype, sortby, reverse)
(DataFrame)
— Expand a Channel according to the files in col

pipen.channel.Channel(data=None, index=None, columns=None, dtype=None, copy=None)

A DataFrame wrapper with creators
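A short, hedged example of the creators and the `expand_dir`/`collapse_files` helpers described above; the paths and patterns are made up, and the plain function-call form (rather than a pipda-style pipe) is an assumption:

```python
from pipen.channel import Channel, collapse_files, expand_dir

# One-column channel: one row (job) per value
ch = Channel.create([1, 2, 3])

# Two-column channel from (sample, fastq) pairs
pairs = Channel.create([("s1", "a.fq"), ("s2", "b.fq")])

# Expand a directory column into one row per matching file,
# then collapse those rows back to their common directory
dirs = Channel.create(["/path/to/dir"])
expanded = expand_dir(dirs, col=0, pattern="*.txt")
collapsed = collapse_files(expanded, col=0)
```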
data (optional)
— Dict can contain Series, arrays, constants, dataclass or list-like objects. If data is a dict, column order follows insertion-order. If a dict contains Series which have an index defined, it is aligned by its index. This alignment also occurs if data is a Series or a DataFrame itself. Alignment is done on Series/DataFrame inputs.
index (Axes | None, optional)
— Index to use for resulting frame. Will default to RangeIndex if no indexing information part of input data and no index provided.
columns (Axes | None, optional)
— Column labels to use for resulting frame when data does not have them, defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels, will perform column selection instead.
dtype (Dtype | None, optional)
— Data type to force. Only a single dtype is allowed. If None, infer.
copy (bool | None, optional)
— Copy data from inputs. For dict data, the default of None behaves like copy=True. For DataFrame or 2d ndarray input, the default of None behaves like copy=False. If data is a dict containing one or more Series (possibly of different dtypes), copy=False will ensure that these inputs are not copied.
T
+
+— The transpose of the DataFrame.</>at
+(_AtIndexer)
+— Access a single value for a row/column label pair.loc
, in that both provide label-based lookups. Use
+at
if you only need to get or set a single value in a DataFrame
+or Series.
+</>attrs
+(dict[Hashable, Any])
+— Dictionary of global attributes of this dataset.axes
+(list)
+— Return a list representing the axes of the DataFrame.dtypes
+
+— Return the dtypes in the DataFrame.object
dtype. See
+:ref:the User Guide <basics.dtypes>
for more.
+</>empty
+
+— Indicator whether Series/DataFrame is empty.flags
+(Flags)
+— Get the properties associated with this pandas object.Flags.allows_duplicate_labels
iat
+(_iAtIndexer)
+— Access a single value for a row/column pair by integer position.iloc
, in that both provide integer-based lookups. Use
+iat
if you only need to get or set a single value in a DataFrame
+or Series.
+</>iloc
+(_iLocIndexer)
+— Purely integer-location based indexing for selection by position..iloc[]
is primarily integer position based (from 0
to
+length-1
of the axis), but may also be used with a boolean
+array.5
.[4, 3, 0]
.1:7
.callable
function with one argument (the calling Series or
+ DataFrame) and that returns valid output for indexing (one of the above).
+ This is useful in method chains, when you don't have a reference to the
+ calling object, but would like to base your selection on
+ some value.(0, 1)
..iloc
will raise IndexError
if a requested indexer is
+out-of-bounds, except slice indexers which allow out-of-bounds
+indexing (this conforms with python/numpy slice semantics).Selection by Position <indexing.integer>
.
+</>loc
+(_LocIndexer)
+— Access a group of rows and columns by label(s) or a boolean array..loc[]
is primarily label based, but may also be used with a
+boolean array.5
or 'a'
, (note that 5
is
+ interpreted as a label of the index, and never as an
+ integer position along the index).['a', 'b', 'c']
.'a':'f'
.[True, False, True]
.callable
function with one argument (the calling Series or
+ DataFrame) and that returns valid output for indexing (one of the above)Selection by Label <indexing.label>
.
+</>ndim
+(int)
+— Return an int representing the number of axes / array dimensions.shape
+(tuple)
+— Return a tuple representing the dimensionality of the DataFrame.</>size
+(int)
+— Return an int representing the number of elements in this object.style
+(Styler)
+— Returns a Styler object.values
+
+— Return a Numpy representation of the DataFrame.DataFrame.to_numpy
instead.__add__
(
other
)
+(DataFrame)
+— Get Addition of DataFrame and other, column-wise.</>__arrow_c_stream__
(
requested_schema
)
+(PyCapsule)
+— Export the pandas DataFrame as an Arrow C stream PyCapsule.</>__contains__
(
key
)
+(bool)
+— True if the key is in the info axis</>__dataframe__
(
nan_as_null
, allow_copy
)
+(DataFrame interchange object)
+— Return the dataframe interchange object implementing the interchange protocol.</>__dataframe_consortium_standard__
(
api_version
)
+(Any)
+— Provide entry point to the Consortium DataFrame Standard API.</>__delitem__
(
key
)
+
+— Delete item</>__dir__
(
)
+(list)
+— Provide method name lookup and completion.</>__finalize__
(
other
, method
, **kwargs
)
+
+— Propagate metadata from other to self.</>__getattr__
(
name
)
+
+— After regular attribute access, try looking up the nameThis allows simpler access to columns for interactive use.
+</>__iter__
(
)
+(iterator)
+— Iterate over info axis.</>__len__
(
)
+(int)
+— Returns length of info axis, but here we use the index.</>__matmul__
(
other
)
+(pandas.core.frame.dataframe | pandas.core.series.series)
+— Matrix multiplication using binary @
operator.</>__repr__
(
)
+(str)
+— Return a string representation for a particular DataFrame.</>__rmatmul__
(
other
)
+(DataFrame)
+— Matrix multiplication using binary @
operator.</>__setattr__
(
name
, value
)
+
+— After regular attribute access, try setting the nameThis allows simpler access to columns for interactive use.
+</>__sizeof__
(
)
+(int)
+— Generates the total memory usage for an object that returnseither a value or Series of values
+</>abs
(
)
+(abs)
+— Return a Series/DataFrame with absolute numeric value of each element.</>add
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Addition of dataframe and other, element-wise (binary operator add
).</>add_prefix
(
prefix
, axis
)
+(Series or DataFrame)
+— Prefix labels with string prefix
.</>add_suffix
(
suffix
, axis
)
+(Series or DataFrame)
+— Suffix labels with string suffix
.</>aggregate
(
func
, axis
, *args
, **kwargs
)
+(scalar, Series or DataFrame)
+— Aggregate using one or more operations over the specified axis.</>align
(
other
, join
, axis
, level
, copy
, fill_value
, method
, limit
, fill_axis
, broadcast_axis
)
+(tuple of (Series/DataFrame, type of other))
+— Align two objects on their axes with the specified join method.</>all
(
axis
, bool_only
, skipna
, **kwargs
)
+(Series or DataFrame)
+— Return whether all elements are True, potentially over an axis.</>any
(
axis
, bool_only
, skipna
, **kwargs
)
+(Series or DataFrame)
+— Return whether any element is True, potentially over an axis.</>apply
(
func
, axis
, raw
, result_type
, args
, by_row
, engine
, engine_kwargs
, **kwargs
)
+(Series or DataFrame)
+— Apply a function along an axis of the DataFrame.</>applymap
(
func
, na_action
, **kwargs
)
+(DataFrame)
+— Apply a function to a Dataframe elementwise.</>asfreq
(
freq
, method
, how
, normalize
, fill_value
)
+(Series/DataFrame)
+— Convert time series to specified frequency.</>asof
(
where
, subset
)
+(scalar, Series, or DataFrame)
+— Return the last row(s) without any NaNs before where
.</>assign
(
**kwargs
)
+(DataFrame)
+— Assign new columns to a DataFrame.</>astype
(
dtype
, copy
, errors
)
+(same type as caller)
+— Cast a pandas object to a specified dtype dtype
.</>at_time
(
time
, asof
, axis
)
+(Series or DataFrame)
+— Select values at particular time of day (e.g., 9:30AM).</>backfill
(
axis
, inplace
, limit
, downcast
)
+(Series/DataFrame or None)
+— Fill NA/NaN values by using the next valid observation to fill the gap.</>between_time
(
start_time
, end_time
, inclusive
, axis
)
+(Series or DataFrame)
+— Select values between particular times of the day (e.g., 9:00-9:30 AM).</>bfill
(
axis
, inplace
, limit
, limit_area
, downcast
)
+(Series/DataFrame or None)
+— Fill NA/NaN values by using the next valid observation to fill the gap.</>bool
(
)
+(bool)
+— Return the bool of a single element Series or DataFrame.</>clip
(
lower
, upper
, axis
, inplace
, **kwargs
)
+(Series or DataFrame or None)
+— Trim values at input threshold(s).</>combine
(
other
, func
, fill_value
, overwrite
)
+(DataFrame)
+— Perform column-wise combine with another DataFrame.</>combine_first
(
other
)
+(DataFrame)
+— Update null elements with value in the same location in other
.</>compare
(
other
, align_axis
, keep_shape
, keep_equal
, result_names
)
+(DataFrame)
+— Compare to another DataFrame and show the differences.</>convert_dtypes
(
infer_objects
, convert_string
, convert_integer
, convert_boolean
, convert_floating
, dtype_backend
)
+(Series or DataFrame)
+— Convert columns to the best possible dtypes using dtypes supporting pd.NA
.</>copy
(
deep
)
+(Series or DataFrame)
+— Make a copy of this object's indices and data.</>corr
(
method
, min_periods
, numeric_only
)
+(DataFrame)
+— Compute pairwise correlation of columns, excluding NA/null values.</>corrwith
(
other
, axis
, drop
, method
, numeric_only
)
+(Series)
+— Compute pairwise correlation.</>count
(
axis
, numeric_only
)
+(Series)
+— Count non-NA cells for each column or row.</>cov
(
min_periods
, ddof
, numeric_only
)
+(DataFrame)
+— Compute pairwise covariance of columns, excluding NA/null values.</>create
(
value
)
+(DataFrame)
+— Create a channel from a list.</>cummax
(
axis
, skipna
, *args
, **kwargs
)
+(Series or DataFrame)
+— Return cumulative maximum over a DataFrame or Series axis.</>cummin
(
axis
, skipna
, *args
, **kwargs
)
+(Series or DataFrame)
+— Return cumulative minimum over a DataFrame or Series axis.</>cumprod
(
axis
, skipna
, *args
, **kwargs
)
+(Series or DataFrame)
+— Return cumulative product over a DataFrame or Series axis.</>cumsum
(
axis
, skipna
, *args
, **kwargs
)
+(Series or DataFrame)
+— Return cumulative sum over a DataFrame or Series axis.</>describe
(
percentiles
, include
, exclude
)
+(Series or DataFrame)
+— Generate descriptive statistics.</>diff
(
periods
, axis
)
+(DataFrame)
+— First discrete difference of element.</>dot
(
other
)
+(Series or DataFrame)
+— Compute the matrix multiplication between the DataFrame and other.</>drop
(
labels
, axis
, index
, columns
, level
, inplace
, errors
)
+(DataFrame or None)
+— Drop specified labels from rows or columns.</>drop_duplicates
(
subset
, keep
, inplace
, ignore_index
)
+(DataFrame or None)
+— Return DataFrame with duplicate rows removed.</>droplevel
(
level
, axis
)
+(Series/DataFrame)
+— Return Series/DataFrame with requested index / column level(s) removed.</>dropna
(
axis
, how
, thresh
, subset
, inplace
, ignore_index
)
+(DataFrame or None)
+— Remove missing values.</>duplicated
(
subset
, keep
)
+(Series)
+— Return boolean Series denoting duplicate rows.</>eq
(
other
, axis
, level
)
+(DataFrame of bool)
+— Get Equal to of dataframe and other, element-wise (binary operator eq
).</>equals
(
other
)
+(bool)
+— Test whether two objects contain the same elements.</>eval
(
expr
, inplace
, **kwargs
)
+(ndarray, scalar, pandas object, or None)
+— Evaluate a string describing operations on DataFrame columns.</>ewm
(
com
, span
, halflife
, alpha
, min_periods
, adjust
, ignore_na
, axis
, times
, method
)
+(pandas.api.typing.ExponentialMovingWindow)
+— Provide exponentially weighted (EW) calculations.</>expanding
(
min_periods
, axis
, method
)
+(pandas.api.typing.Expanding)
+— Provide expanding window calculations.</>explode
(
column
, ignore_index
)
+(DataFrame)
+— Transform each element of a list-like to a row, replicating index values.</>ffill
(
axis
, inplace
, limit
, limit_area
, downcast
)
+(Series/DataFrame or None)
+— Fill NA/NaN values by propagating the last valid observation to next valid.</>fillna
(
value
, method
, axis
, inplace
, limit
, downcast
)
+(Series/DataFrame or None)
+— Fill NA/NaN values using the specified method.</>filter
(
items
, like
, regex
, axis
)
+(same type as input object)
+— Subset the dataframe rows or columns according to the specified index labels.</>first
(
offset
)
+(Series or DataFrame)
+— Select initial periods of time series data based on a date offset.</>first_valid_index
(
)
+(type of index)
+— Return index for first non-NA value or None, if no non-NA value is found.</>floordiv
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Integer division of dataframe and other, element-wise (binary operator floordiv
).</>from_csv
(
*args
, **kwargs
)
+
+— Create a channel from a csv file</>from_dict
(
data
, orient
, dtype
, columns
)
+(DataFrame)
+— Construct DataFrame from dict of array-like or dicts.</>from_excel
(
*args
, **kwargs
)
+
+— Create a channel from an excel file.</>from_glob
(
pattern
, ftype
, sortby
, reverse
)
+(DataFrame)
+— Create a channel with a glob pattern</>from_pairs
(
pattern
, ftype
, sortby
, reverse
)
+(DataFrame)
+— Create a width=2 channel with a glob pattern</>from_records
(
data
, index
, exclude
, columns
, coerce_float
, nrows
)
+(DataFrame)
+— Convert structured or record ndarray to DataFrame.</>from_table
(
*args
, **kwargs
)
+
+— Create a channel from a table file.</>ge
(
other
, axis
, level
)
+(DataFrame of bool)
+— Get Greater than or equal to of dataframe and other, element-wise (binary operator ge
).</>get
(
key
, default
)
+(same type as items contained in object)
+— Get item from object for given key (ex: DataFrame column).</>groupby
(
by
, axis
, level
, as_index
, sort
, group_keys
, observed
, dropna
)
+(pandas.api.typing.DataFrameGroupBy)
+— Group DataFrame using a mapper or by a Series of columns.</>gt
(
other
, axis
, level
)
+(DataFrame of bool)
+— Get Greater than of dataframe and other, element-wise (binary operator gt
).</>head
(
n
)
+(same type as caller)
+— Return the first n
rows.</>idxmax
(
axis
, skipna
, numeric_only
)
+(Series)
+— Return index of first occurrence of maximum over requested axis.</>idxmin
(
axis
, skipna
, numeric_only
)
+(Series)
+— Return index of first occurrence of minimum over requested axis.</>infer_objects
(
copy
)
+(same type as input object)
+— Attempt to infer better dtypes for object columns.</>info
(
verbose
, buf
, max_cols
, memory_usage
, show_counts
)
+(None)
+— Print a concise summary of a DataFrame.</>insert
(
loc
, column
, value
, allow_duplicates
)
+
+— Insert column into DataFrame at specified location.</>interpolate
(
method
, axis
, limit
, inplace
, limit_direction
, limit_area
, downcast
, **kwargs
)
+(Series or DataFrame or None)
+— Fill NaN values using an interpolation method.</>isetitem
(
loc
, value
)
+
+— Set the given value in the column with position loc
.</>isin
(
values
)
+(DataFrame)
+— Whether each element in the DataFrame is contained in values.</>isna
(
)
+(DataFrame)
+— Detect missing values.</>isnull
(
)
+(DataFrame)
+— DataFrame.isnull is an alias for DataFrame.isna.</>items
(
)
+(label : object)
+— Iterate over (column name, Series) pairs.</>iterrows
(
)
+(index : label or tuple of label)
+— Iterate over DataFrame rows as (index, Series) pairs.</>itertuples
(
index
, name
)
+(iterator)
+— Iterate over DataFrame rows as namedtuples.</>join
(
other
, on
, how
, lsuffix
, rsuffix
, sort
, validate
)
+(DataFrame)
+— Join columns of another DataFrame.</>keys
(
)
+(Index)
+— Get the 'info axis' (see Indexing for more).</>kurt
(
axis
, skipna
, numeric_only
, **kwargs
)
+(Series or scalar)
+— Return unbiased kurtosis over requested axis.</>last
(
offset
)
+(Series or DataFrame)
+— Select final periods of time series data based on a date offset.</>last_valid_index
(
)
+(type of index)
+— Return index for last non-NA value or None, if no non-NA value is found.</>le
(
other
, axis
, level
)
+(DataFrame of bool)
+— Get Less than or equal to of dataframe and other, element-wise (binary operator le
).</>lt
(
other
, axis
, level
)
+(DataFrame of bool)
+— Get Less than of dataframe and other, element-wise (binary operator lt
).</>map
(
func
, na_action
, **kwargs
)
+(DataFrame)
+— Apply a function to a Dataframe elementwise.</>mask
(
cond
, other
, inplace
, axis
, level
)
+(Same type as caller or None if ``inplace=True``.)
+— Replace values where the condition is True.</>max
(
axis
, skipna
, numeric_only
, **kwargs
)
+(Series or scalar)
+— Return the maximum of the values over the requested axis.</>mean
(
axis
, skipna
, numeric_only
, **kwargs
)
+(Series or scalar)
+— Return the mean of the values over the requested axis.</>median
(
axis
, skipna
, numeric_only
, **kwargs
)
+(Series or scalar)
+— Return the median of the values over the requested axis.</>melt
(
id_vars
, value_vars
, var_name
, value_name
, col_level
, ignore_index
)
+(DataFrame)
+— Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.</>memory_usage
(
index
, deep
)
+(Series)
+— Return the memory usage of each column in bytes.</>merge
(
right
, how
, on
, left_on
, right_on
, left_index
, right_index
, sort
, suffixes
, copy
, indicator
, validate
)
+(DataFrame)
+— Merge DataFrame or named Series objects with a database-style join.</>min
(
axis
, skipna
, numeric_only
, **kwargs
)
+(Series or scalar)
+— Return the minimum of the values over the requested axis.</>mod
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Modulo of dataframe and other, element-wise (binary operator mod
).</>mode
(
axis
, numeric_only
, dropna
)
+(DataFrame)
+— Get the mode(s) of each element along the selected axis.</>mul
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Multiplication of dataframe and other, element-wise (binary operator mul
).</>ne
(
other
, axis
, level
)
+(DataFrame of bool)
+— Get Not equal to of dataframe and other, element-wise (binary operator ne
).</>nlargest
(
n
, columns
, keep
)
+(DataFrame)
+— Return the first n
rows ordered by columns
in descending order.</>notna
(
)
+(DataFrame)
+— Detect existing (non-missing) values.</>notnull
(
)
+(DataFrame)
+— DataFrame.notnull is an alias for DataFrame.notna.</>nsmallest
(
n
, columns
, keep
)
+(DataFrame)
+— Return the first n
rows ordered by columns
in ascending order.</>nunique
(
axis
, dropna
)
+(Series)
+— Count number of distinct elements in specified axis.</>pad
(
axis
, inplace
, limit
, downcast
)
+(Series/DataFrame or None)
+— Fill NA/NaN values by propagating the last valid observation to next valid.</>pct_change
(
periods
, fill_method
, limit
, freq
, **kwargs
)
+(Series or DataFrame)
+— Fractional change between the current and a prior element.</>pipe
(
func
, *args
, **kwargs
)
+(the return type of ``func``.)
+— Apply chainable functions that expect Series or DataFrames.</>pivot
(
columns
, index
, values
)
+(DataFrame)
+— Return reshaped DataFrame organized by given index / column values.</>pivot_table
(
values
, index
, columns
, aggfunc
, fill_value
, margins
, dropna
, margins_name
, observed
, sort
)
+(DataFrame)
+— Create a spreadsheet-style pivot table as a DataFrame.</>pop
(
item
)
+(Series)
+— Return item and drop from frame. Raise KeyError if not found.</>pow
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Exponential power of dataframe and other, element-wise (binary operator pow
).</>prod
(
axis
, skipna
, numeric_only
, min_count
, **kwargs
)
+(Series or scalar)
+— Return the product of the values over the requested axis.</>quantile
(
q
, axis
, numeric_only
, interpolation
, method
)
+(Series or DataFrame)
+— Return values at the given quantile over requested axis.</>query
(
expr
, inplace
, **kwargs
)
+(DataFrame or None)
+— Query the columns of a DataFrame with a boolean expression.</>radd
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Addition of dataframe and other, element-wise (binary operator radd
).</>rank
(
axis
, method
, numeric_only
, na_option
, ascending
, pct
)
+(same type as caller)
+— Compute numerical data ranks (1 through n) along axis.</>reindex
(
labels
, index
, columns
, axis
, method
, copy
, level
, fill_value
, limit
, tolerance
)
+(DataFrame with changed index.)
+— Conform DataFrame to new index with optional filling logic.</>reindex_like
(
other
, method
, copy
, limit
, tolerance
)
+(Series or DataFrame)
+— Return an object with matching indices as other object.</>rename
(
mapper
, index
, columns
, axis
, copy
, inplace
, level
, errors
)
+(DataFrame or None)
+— Rename columns or index labels.</>rename_axis
(
mapper
, index
, columns
, axis
, copy
, inplace
)
+(Series, DataFrame, or None)
+— Set the name of the axis for the index or columns.</>reorder_levels
(
order
, axis
)
+(DataFrame)
+— Rearrange index levels using input order. May not drop or duplicate levels.</>replace
(
to_replace
, value
, inplace
, limit
, regex
, method
)
+(Series/DataFrame)
+— Replace values given in to_replace
with value
.</>resample
(
rule
, axis
, closed
, label
, convention
, kind
, on
, level
, origin
, offset
, group_keys
)
+(pandas.api.typing.Resampler)
+— Resample time-series data.</>reset_index
(
level
, drop
, inplace
, col_level
, col_fill
, allow_duplicates
, names
)
+(DataFrame or None)
+— Reset the index, or a level of it.</>rfloordiv
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Integer division of dataframe and other, element-wise (binary operator rfloordiv
).</>rmod
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Modulo of dataframe and other, element-wise (binary operator rmod
).</>rmul
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Multiplication of dataframe and other, element-wise (binary operator rmul
).</>rolling
(
window
, min_periods
, center
, win_type
, on
, axis
, closed
, step
, method
)
+(pandas.api.typing.Window or pandas.api.typing.Rolling)
+— Provide rolling window calculations.</>round
(
decimals
, *args
, **kwargs
)
+(DataFrame)
+— Round a DataFrame to a variable number of decimal places.</>rpow
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Exponential power of dataframe and other, element-wise (binary operator rpow
).</>rsub
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Subtraction of dataframe and other, element-wise (binary operator rsub
).</>rtruediv
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Floating division of dataframe and other, element-wise (binary operator rtruediv
).</>sample
(
n
, frac
, replace
, weights
, random_state
, axis
, ignore_index
)
+(Series or DataFrame)
+— Return a random sample of items from an axis of object.</>select_dtypes
(
include
, exclude
)
+(DataFrame)
+— Return a subset of the DataFrame's columns based on the column dtypes.</>sem
(
axis
, skipna
, ddof
, numeric_only
, **kwargs
)
+(Series or DataFrame (if level specified))
+— Return unbiased standard error of the mean over requested axis.</>set_axis
(
labels
, axis
, copy
)
+(DataFrame)
+— Assign desired index to given axis.</>set_flags
(
copy
, allows_duplicate_labels
)
+(Series or DataFrame)
+— Return a new object with updated flags.</>set_index
(
keys
, drop
, append
, inplace
, verify_integrity
)
+(DataFrame or None)
+— Set the DataFrame index using existing columns.</>shift
(
periods
, freq
, axis
, fill_value
, suffix
)
+(DataFrame)
+— Shift index by desired number of periods with an optional time freq
.</>skew
(
axis
, skipna
, numeric_only
, **kwargs
)
+(Series or scalar)
+— Return unbiased skew over requested axis.</>sort_index
(
axis
, level
, ascending
, inplace
, kind
, na_position
, sort_remaining
, ignore_index
, key
)
+(DataFrame or None)
+— Sort object by labels (along an axis).</>sort_values
(
by
, axis
, ascending
, inplace
, kind
, na_position
, ignore_index
, key
)
+(DataFrame or None)
+— Sort by the values along either axis.</>squeeze
(
axis
)
+(DataFrame, Series, or scalar)
+— Squeeze 1 dimensional axis objects into scalars.</>stack
(
level
, dropna
, sort
, future_stack
)
+(DataFrame or Series)
+— Stack the prescribed level(s) from columns to index.</>std
(
axis
, skipna
, ddof
, numeric_only
, **kwargs
)
+(Series or DataFrame (if level specified))
+— Return sample standard deviation over requested axis.</>sub
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Subtraction of dataframe and other, element-wise (binary operator sub
).</>sum
(
axis
, skipna
, numeric_only
, min_count
, **kwargs
)
+(Series or scalar)
+— Return the sum of the values over the requested axis.</>swapaxes
(
axis1
, axis2
, copy
)
+(same as input)
+— Interchange axes and swap values axes appropriately.</>swaplevel
(
i
, j
, axis
)
+(DataFrame)
+— Swap levels i and j in a :class:MultiIndex
.</>tail
(
n
)
+(type of caller)
+— Return the last n
rows.</>take
(
indices
, axis
, **kwargs
)
+(same type as caller)
+— Return the elements in the given positional indices along an axis.</>to_clipboard
(
excel
, sep
, **kwargs
)
+
+— Copy object to the system clipboard.</>to_csv
(
path_or_buf
, sep
, na_rep
, float_format
, columns
, header
, index
, index_label
, mode
, encoding
, compression
, quoting
, quotechar
, lineterminator
, chunksize
, date_format
, doublequote
, escapechar
, decimal
, errors
, storage_options
)
+(None or str)
+— Write object to a comma-separated values (csv) file.</>to_dict
(
orient
, into
, index
)
+(dict, list or collections.abc.MutableMapping)
+— Convert the DataFrame to a dictionary.</>to_excel
(
excel_writer
, sheet_name
, na_rep
, float_format
, columns
, header
, index
, index_label
, startrow
, startcol
, engine
, merge_cells
, inf_rep
, freeze_panes
, storage_options
, engine_kwargs
)
+
+— Write object to an Excel sheet.</>to_feather
(
path
, **kwargs
)
+
+— Write a DataFrame to the binary Feather format.</>to_gbq
(
destination_table
, project_id
, chunksize
, reauth
, if_exists
, auth_local_webserver
, table_schema
, location
, progress_bar
, credentials
)
+
+— Write a DataFrame to a Google BigQuery table.</>to_hdf
(
path_or_buf
, key
, mode
, complevel
, complib
, append
, format
, index
, min_itemsize
, nan_rep
, dropna
, data_columns
, errors
, encoding
)
+
+— Write the contained data to an HDF5 file using HDFStore.</>to_html
(
buf
, columns
, col_space
, header
, index
, na_rep
, formatters
, float_format
, sparsify
, index_names
, justify
, max_rows
, max_cols
, show_dimensions
, decimal
, bold_rows
, classes
, escape
, notebook
, border
, table_id
, render_links
, encoding
)
+(str or None)
+— Render a DataFrame as an HTML table.</>to_json
(
path_or_buf
, orient
, date_format
, double_precision
, force_ascii
, date_unit
, default_handler
, lines
, compression
, index
, indent
, storage_options
, mode
)
+(None or str)
+— Convert the object to a JSON string.</>to_latex
(
buf
, columns
, header
, index
, na_rep
, formatters
, float_format
, sparsify
, index_names
, bold_rows
, column_format
, longtable
, escape
, encoding
, decimal
, multicolumn
, multicolumn_format
, multirow
, caption
, label
, position
)
+(str or None)
+— Render object to a LaTeX tabular, longtable, or nested table.</>to_markdown
(
buf
, mode
, index
, storage_options
, **kwargs
)
+(str)
+— Print DataFrame in Markdown-friendly format.</>to_numpy
(
dtype
, copy
, na_value
)
+(numpy.ndarray)
+— Convert the DataFrame to a NumPy array.</>to_orc
(
path
, engine
, index
, engine_kwargs
)
+(bytes if no path argument is provided else None)
+— Write a DataFrame to the ORC format.</>to_parquet
(
path
, engine
, compression
, index
, partition_cols
, storage_options
, **kwargs
)
+(bytes if no path argument is provided else None)
+— Write a DataFrame to the binary parquet format.</>to_period
(
freq
, axis
, copy
)
+(DataFrame)
+— Convert DataFrame from DatetimeIndex to PeriodIndex.</>to_pickle
(
path
, compression
, protocol
, storage_options
)
+
+— Pickle (serialize) object to file.</>to_records
(
index
, column_dtypes
, index_dtypes
)
+(numpy.rec.recarray)
+— Convert DataFrame to a NumPy record array.</>to_sql
(
name
, con
, schema
, if_exists
, index
, index_label
, chunksize
, dtype
, method
)
+(None or int)
+— Write records stored in a DataFrame to a SQL database.</>to_stata
(
path
, convert_dates
, write_index
, byteorder
, time_stamp
, data_label
, variable_labels
, version
, convert_strl
, compression
, storage_options
, value_labels
)
+
+— Export DataFrame object to Stata dta format.</>to_string
(
buf
, columns
, col_space
, header
, index
, na_rep
, formatters
, float_format
, sparsify
, index_names
, justify
, max_rows
, max_cols
, show_dimensions
, decimal
, line_width
, min_rows
, max_colwidth
, encoding
)
+(str or None)
+— Render a DataFrame to a console-friendly tabular output.</>to_timestamp
(
freq
, how
, axis
, copy
)
+(DataFrame)
+— Cast to DatetimeIndex of timestamps, at beginning of period.</>to_xarray
(
)
+(xarray.DataArray or xarray.Dataset)
+— Return an xarray object from the pandas object.</>to_xml
(
path_or_buffer
, index
, root_name
, row_name
, na_rep
, attr_cols
, elem_cols
, namespaces
, prefix
, encoding
, xml_declaration
, pretty_print
, parser
, stylesheet
, compression
, storage_options
)
+(None or str)
+— Render a DataFrame to an XML document.</>transform
(
func
, axis
, *args
, **kwargs
)
+(DataFrame)
+— Call func
on self producing a DataFrame with the same axis shape as self.</>transpose
(
*args
, copy
)
+(DataFrame)
+— Transpose index and columns.</>truediv
(
other
, axis
, level
, fill_value
)
+(DataFrame)
+— Get Floating division of dataframe and other, element-wise (binary operator truediv
).</>truncate
(
before
, after
, axis
, copy
)
+(type of caller)
+— Truncate a Series or DataFrame before and after some index value.</>tz_convert
(
tz
, axis
, level
, copy
)
+(Series/DataFrame)
+— Convert tz-aware axis to target time zone.</>tz_localize
(
tz
, axis
, level
, copy
, ambiguous
, nonexistent
)
+(Series/DataFrame)
+— Localize tz-naive index of a Series or DataFrame to target time zone.</>unstack
(
level
, fill_value
, sort
)
+(Series or DataFrame)
+— Pivot a level of the (necessarily hierarchical) index labels.</>update
(
other
, join
, overwrite
, filter_func
, errors
)
+(None)
+— Modify in place using non-NA values from another DataFrame.</>value_counts
(
subset
, normalize
, sort
, ascending
, dropna
)
+(Series)
+— Return a Series containing the frequency of each distinct row in the Dataframe.</>var
(
axis
, skipna
, ddof
, numeric_only
, **kwargs
)
+(Series or DataFrame (if level specified))
+— Return unbiased variance over requested axis.</>where
(
cond
, other
, inplace
, axis
, level
)
+(Same type as caller or None if ``inplace=True``.)
+— Replace values where the condition is False.</>xs
(
key
, axis
, level
, drop_level
)
+(Series or DataFrame)
+— Return cross-section from the Series/DataFrame.</>__add__
(
other
)
Get Addition of DataFrame and other, column-wise.
Equivalent to DataFrame.add(other)
.
other
+(scalar, sequence, Series, dict or DataFrame)
+— Object to be added to the DataFrame.The result of adding other
to DataFrame.
DataFrame.add : Add a DataFrame and another object, with option for index- or column-oriented addition.
+>>> df = pd.DataFrame({'height': [1.5, 2.6], 'weight': [500, 800]},... index=['elk', 'moose'])
+>>> df
+ height weight
+elk 1.5 500
+moose 2.6 800
+
Adding a scalar affects all rows and columns.
+>>> df[['height', 'weight']] + 1.5
+ height weight
+elk 3.0 501.5
+moose 4.1 801.5
+
Each element of a list is added to a column of the DataFrame, in order.
+>>> df[['height', 'weight']] + [0.5, 1.5]
+ height weight
+elk 2.0 501.5
+moose 3.1 801.5
+
Keys of a dictionary are aligned to the DataFrame, based on column names; +each value in the dictionary is added to the corresponding column.
+>>> df[['height', 'weight']] + {'height': 0.5, 'weight': 1.5}
+ height weight
+elk 2.0 501.5
+moose 3.1 801.5
+
When other
is a :class:Series
, the index of other
is aligned with the
+columns of the DataFrame.
>>> s1 = pd.Series([0.5, 1.5], index=['weight', 'height'])
+>>> df[['height', 'weight']] + s1
+ height weight
+elk 3.0 500.5
+moose 4.1 800.5
+
Even when the index of other
is the same as the index of the DataFrame,
+the :class:Series
will not be reoriented. If index-wise alignment is desired,
+:meth:DataFrame.add
should be used with axis='index'
.
>>> s2 = pd.Series([0.5, 1.5], index=['elk', 'moose'])
+>>> df[['height', 'weight']] + s2
+ elk height moose weight
+elk NaN NaN NaN NaN
+moose NaN NaN NaN NaN
+
>>> df[['height', 'weight']].add(s2, axis='index')
+ height weight
+elk 2.0 500.5
+moose 4.1 801.5
+
When other
is a :class:DataFrame
, both columns names and the
+index are aligned.
>>> other = pd.DataFrame({'height': [0.2, 0.4, 0.6]},
+... index=['elk', 'moose', 'deer'])
+>>> df[['height', 'weight']] + other
+ height weight
+deer NaN NaN
+elk 1.7 NaN
+moose 3.0 NaN
+
__dir__
(
)
→ list — Provide method name lookup and completion.
Notes
+Only provide 'public' methods.
+__sizeof__
(
)
→ int — Generates the total memory usage for an object that returns either a value or Series of values
+set_flags
(
copy=False
, allows_duplicate_labels=None
)
Return a new object with updated flags.
copy
+(bool, default False)
+— Specify if a copy of the object should be made.copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas.You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+allows_duplicate_labels
+(bool, optional)
+— Whether the returned object allows duplicate labels.The same type as the caller.
DataFrame.attrs : Global metadata applying to this dataset.DataFrame.flags : Global flags applying to this object.
+Notes
+This method returns a new object that's a view on the same data +as the input. Mutating the input or the output values will be reflected +in the other.
+This method is intended to be used in method chains.
+"Flags" differ from "metadata". Flags reflect properties of the
+pandas object (the Series or DataFrame). Metadata refer to properties
+of the dataset, and should be stored in :attr:DataFrame.attrs
.
>>> df = pd.DataFrame({"A": [1, 2]})>>> df.flags.allows_duplicate_labels
+True
+>>> df2 = df.set_flags(allows_duplicate_labels=False)
+>>> df2.flags.allows_duplicate_labels
+False
+
swapaxes
(
axis1
, axis2
, copy=None
)
Interchange axes and swap values axes appropriately.
.. deprecated:: 2.1.0
+ swapaxes
is deprecated and will be removed.
+ Please use transpose
instead.
Please see examples for :meth:DataFrame.transpose
.
droplevel
(
level
, axis=0
)
Return Series/DataFrame with requested index / column level(s) removed.
level
+(int, str, or list-like)
+— If a string is given, must be the name of a levelIf list-like, elements must be names or positional indexes
+of levels.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— Axis along which the level(s) is removed:Series
this parameter is unused and defaults to 0.
+Series/DataFrame with requested index / column level(s) removed.
>>> df = pd.DataFrame([... [1, 2, 3, 4],
+... [5, 6, 7, 8],
+... [9, 10, 11, 12]
+... ]).set_index([0, 1]).rename_axis(['a', 'b'])
+
>>> df.columns = pd.MultiIndex.from_tuples([
+... ('c', 'e'), ('d', 'f')
+... ], names=['level_1', 'level_2'])
+
>>> df
+level_1 c d
+level_2 e f
+a b
+1 2 3 4
+5 6 7 8
+9 10 11 12
+
>>> df.droplevel('a')
+level_1 c d
+level_2 e f
+b
+2 3 4
+6 7 8
+10 11 12
+
>>> df.droplevel('level_2', axis=1)
+level_1 c d
+a b
+1 2 3 4
+5 6 7 8
+9 10 11 12
+
squeeze
(
axis=None
)
Squeeze 1 dimensional axis objects into scalars.
Series or DataFrames with a single element are squeezed to a scalar. +DataFrames with a single column or a single row are squeezed to a +Series. Otherwise the object is unchanged.
+This method is most useful when you don't know if your
+object is a Series or DataFrame, but you do know it has just a single
+column. In that case you can safely call squeeze
to ensure you have a
+Series.
axis
+({0 or 'index', 1 or 'columns', None}, default None)
+— A specific axis to squeeze. By default, all length-1 axes aresqueezed. For Series
this parameter is unused and defaults to None
.
+The projection after squeezing axis
or all the axes.
Series.iloc : Integer-location based indexing for selecting scalars.DataFrame.iloc : Integer-location based indexing for selecting Series. +Series.to_frame : Inverse of DataFrame.squeeze for a + single-column DataFrame.
+>>> primes = pd.Series([2, 3, 5, 7])
Slicing might produce a Series with a single value:
+>>> even_primes = primes[primes % 2 == 0]
+>>> even_primes
+0 2
+dtype: int64
+
>>> even_primes.squeeze()
+2
+
Squeezing objects with more than one value in every axis does nothing:
+>>> odd_primes = primes[primes % 2 == 1]
+>>> odd_primes
+1 3
+2 5
+3 7
+dtype: int64
+
>>> odd_primes.squeeze()
+1 3
+2 5
+3 7
+dtype: int64
+
Squeezing is even more effective when used with DataFrames.
+>>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
+>>> df
+ a b
+0 1 2
+1 3 4
+
Slicing a single column will produce a DataFrame with the columns +having only one value:
+>>> df_a = df[['a']]
+>>> df_a
+ a
+0 1
+1 3
+
So the columns can be squeezed down, resulting in a Series:
+>>> df_a.squeeze('columns')
+0 1
+1 3
+Name: a, dtype: int64
+
Slicing a single row from a single column will produce a single +scalar DataFrame:
+>>> df_0a = df.loc[df.index < 1, ['a']]
+>>> df_0a
+ a
+0 1
+
Squeezing the rows produces a single scalar Series:
+>>> df_0a.squeeze('rows')
+a 1
+Name: 0, dtype: int64
+
Squeezing all axes will project directly into a scalar:
+>>> df_0a.squeeze()
+1
+
rename_axis
(
mapper=<no_default>
, index=<no_default>
, columns=<no_default>
, axis=0
, copy=None
, inplace=False
)
Set the name of the axis for the index or columns.
mapper
+(scalar, list-like, optional)
+— Value to set the axis name attribute.axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis to rename. For Series
this parameter is unused and defaults to 0.copy
+(bool, default None)
+— Also copy underlying data.copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas.You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+inplace
+(bool, default False)
+— Modifies the object directly, instead of creating a new Seriesor DataFrame.
+The same type as the caller or None if inplace=True
.
Series.rename : Alter Series index labels or name.DataFrame.rename : Alter DataFrame index labels or name. +Index.rename : Set new names on index.
+Notes
+DataFrame.rename_axis
supports two calling conventions
(index=index_mapper, columns=columns_mapper, ...)
(mapper, axis={'index', 'columns'}, ...)
The first calling convention will only modify the names of
+the index and/or the names of the Index object that is the columns.
+In this case, the parameter copy
is ignored.
The second calling convention will modify the names of the +corresponding index if mapper is a list or a scalar. +However, if mapper is dict-like or a function, it will use the +deprecated behavior of modifying the axis labels.
+We highly recommend using keyword arguments to clarify your +intent.
+Series
>>> s = pd.Series(["dog", "cat", "monkey"])
+>>> s
+0 dog
+1 cat
+2 monkey
+dtype: object
+>>> s.rename_axis("animal")
+animal
+0 dog
+1 cat
+2 monkey
+dtype: object
+
DataFrame
+>>> df = pd.DataFrame({"num_legs": [4, 4, 2],
+... "num_arms": [0, 0, 2]},
+... ["dog", "cat", "monkey"])
+>>> df
+ num_legs num_arms
+dog 4 0
+cat 4 0
+monkey 2 2
+>>> df = df.rename_axis("animal")
+>>> df
+ num_legs num_arms
+animal
+dog 4 0
+cat 4 0
+monkey 2 2
+>>> df = df.rename_axis("limbs", axis="columns")
+>>> df
+limbs num_legs num_arms
+animal
+dog 4 0
+cat 4 0
+monkey 2 2
+
MultiIndex
+>>> df.index = pd.MultiIndex.from_product([['mammal'],
+... ['dog', 'cat', 'monkey']],
+... names=['type', 'name'])
+>>> df
+limbs num_legs num_arms
+type name
+mammal dog 4 0
+ cat 4 0
+ monkey 2 2
+
>>> df.rename_axis(index={'type': 'class'})
+limbs num_legs num_arms
+class name
+mammal dog 4 0
+ cat 4 0
+ monkey 2 2
+
>>> df.rename_axis(columns=str.upper)
+LIMBS num_legs num_arms
+type name
+mammal dog 4 0
+ cat 4 0
+ monkey 2 2
+
equals
(
other
)
Test whether two objects contain the same elements.
This function allows two Series or DataFrames to be compared against +each other to see if they have the same shape and elements. NaNs in +the same location are considered equal.
+The row/column index do not need to have the same type, as long +as the values are considered equal. Corresponding columns and +index must be of the same dtype.
+other
+(Series or DataFrame)
+— The other Series or DataFrame to be compared with the first.True if all elements are the same in both objects, Falseotherwise.
+Series.eq : Compare two Series objects of the same length and return a Series where each element is True if the element + in each Series is equal, False otherwise. +DataFrame.eq : Compare two DataFrame objects of the same shape and + return a DataFrame where each element is True if the respective + element in each DataFrame is equal, False otherwise. +testing.assert_series_equal : Raises an AssertionError if left and + right are not equal. Provides an easy interface to ignore + inequality in dtypes, indexes and precision among others. +testing.assert_frame_equal : Like assert_series_equal, but targets + DataFrames. +numpy.array_equal : Return True if two arrays have the same shape + and elements, False otherwise.
+>>> df = pd.DataFrame({1: [10], 2: [20]})>>> df
+ 1 2
+0 10 20
+
DataFrames df and exactly_equal have the same types and values for +their elements and column labels, which will return True.
+>>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
+>>> exactly_equal
+ 1 2
+0 10 20
+>>> df.equals(exactly_equal)
+True
+
DataFrames df and different_column_type have the same element +types and values, but have different types for the column labels, +which will still return True.
+>>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
+>>> different_column_type
+ 1.0 2.0
+0 10 20
+>>> df.equals(different_column_type)
+True
+
DataFrames df and different_data_type have different types for the +same values for their elements, and will return False even though +their column labels are the same values and types.
+>>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
+>>> different_data_type
+ 1 2
+0 10.0 20.0
+>>> df.equals(different_data_type)
+False
+
bool
(
)
Return the bool of a single element Series or DataFrame.
.. deprecated:: 2.1.0
+bool is deprecated and will be removed in future version of pandas.
+ For Series
use pandas.Series.item
.
This must be a boolean scalar value, either True or False. It will raise a +ValueError if the Series or DataFrame does not have exactly 1 element, or that +element is not boolean (integer values 0 and 1 will also raise an exception).
+The value in the Series or DataFrame.
Series.astype : Change the data type of a Series, including to boolean.DataFrame.astype : Change the data type of a DataFrame, including to boolean. +numpy.bool_ : NumPy boolean data type, used by pandas for boolean values.
+The method will only work for single element objects with a boolean value:
>>> pd.Series([True]).bool() # doctest: +SKIP
+True
+>>> pd.Series([False]).bool() # doctest: +SKIP
+False
+
>>> pd.DataFrame({'col': [True]}).bool() # doctest: +SKIP
+True
+>>> pd.DataFrame({'col': [False]}).bool() # doctest: +SKIP
+False
+
This is an alternative method and will only work +for single element objects with a boolean value:
+>>> pd.Series([True]).item() # doctest: +SKIP
+True
+>>> pd.Series([False]).item() # doctest: +SKIP
+False
+
abs
(
)
Return a Series/DataFrame with absolute numeric value of each element.
This function only applies to elements that are all numeric.
+Series/DataFrame containing the absolute value of each element.
numpy.absolute : Calculate the absolute value element-wise.
Notes
+For complex
inputs, 1.2 + 1j
, the absolute value is
+:math:\sqrt{ a^2 + b^2 }
.
Absolute numeric values in a Series.
>>> s = pd.Series([-1.10, 2, -3.33, 4])
+>>> s.abs()
+0 1.10
+1 2.00
+2 3.33
+3 4.00
+dtype: float64
+
Absolute numeric values in a Series with complex numbers.
+>>> s = pd.Series([1.2 + 1j])
+>>> s.abs()
+0 1.56205
+dtype: float64
+
Absolute numeric values in a Series with a Timedelta element.
+>>> s = pd.Series([pd.Timedelta('1 days')])
+>>> s.abs()
+0 1 days
+dtype: timedelta64[ns]
+
Select rows with data closest to certain value using argsort (from
+StackOverflow <https://stackoverflow.com/a/17758115>
__).
>>> df = pd.DataFrame({
+... 'a': [4, 5, 6, 7],
+... 'b': [10, 20, 30, 40],
+... 'c': [100, 50, -30, -50]
+... })
+>>> df
+ a b c
+0 4 10 100
+1 5 20 50
+2 6 30 -30
+3 7 40 -50
+>>> df.loc[(df.c - 43).abs().argsort()]
+ a b c
+1 5 20 50
+0 4 10 100
+2 6 30 -30
+3 7 40 -50
+
__iter__
(
)
Iterate over info axis.
Info axis as iterator.
>>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})>>> for x in df:
+... print(x)
+A
+B
+
keys
(
)
Get the 'info axis' (see Indexing for more).
This is index for Series, columns for DataFrame.
+Info axis.
>>> d = pd.DataFrame(data={'A': [1, 2, 3], 'B': [0, 4, 8]},... index=['a', 'b', 'c'])
+>>> d
+ A B
+a 1 0
+b 2 4
+c 3 8
+>>> d.keys()
+Index(['A', 'B'], dtype='object')
+
__contains__
(
key
)
→ boolTrue if the key is in the info axis
to_excel
(
excel_writer
, sheet_name='Sheet1'
, na_rep=''
, float_format=None
, columns=None
, header=True
, index=True
, index_label=None
, startrow=0
, startcol=0
, engine=None
, merge_cells=True
, inf_rep='inf'
, freeze_panes=None
, storage_options=None
, engine_kwargs=None
)
Write object to an Excel sheet.
To write a single object to an Excel .xlsx file it is only necessary to
+specify a target file name. To write to multiple sheets it is necessary to
+create an ExcelWriter
object with a target file name, and specify a sheet
+in the file to write to.
Multiple sheets may be written to by specifying unique sheet_name
.
+With all data written to the file it is necessary to save the changes.
+Note that creating an ExcelWriter
object with a file name that already
+exists will result in the contents of the existing file being erased.
excel_writer
+(path-like, file-like, or ExcelWriter object)
+— File path or existing ExcelWriter.sheet_name
+(str, default 'Sheet1')
+— Name of sheet which will contain DataFrame.na_rep
+(str, default '')
+— Missing data representation.float_format
+(str, optional)
+— Format string for floating point numbers. For examplefloat_format="%.2f"
will format 0.1234 to 0.12.
+columns
+(sequence or list of str, optional)
+— Columns to write.header
+(bool or list of str, default True)
+— Write out the column names. If a list of string is given it isassumed to be aliases for the column names.
+index
+(bool, default True)
+— Write row names (index).index_label
+(str or sequence, optional)
+— Column label for index column(s) if desired. If not specified, andheader
and index
are True, then the index names are used. A
+sequence should be given if the DataFrame uses MultiIndex.
+startrow
+(int, default 0)
+— Upper left cell row to dump data frame.startcol
+(int, default 0)
+— Upper left cell column to dump data frame.engine
+(str, optional)
+— Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set thisvia the options io.excel.xlsx.writer
or
+io.excel.xlsm.writer
.
+merge_cells
+(bool, default True)
+— Write MultiIndex and Hierarchical Rows as merged cells.inf_rep
+(str, default 'inf')
+— Representation for infinity (there is no native representation forinfinity in Excel).
+freeze_panes
+(tuple of int (length 2), optional)
+— Specifies the one-based bottommost row and rightmost column thatis to be frozen.
+storage_options
+(dict, optional)
+— Extra options that make sense for a particular storage connection, e.g.host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+are forwarded to urllib.request.Request
as header options. For other
+URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+forwarded to fsspec.open
. Please see fsspec
and urllib
for more
+details, and for more examples on storage options refer here
+<https://pandas.pydata.org/docs/user_guide/io.html?
+highlight=storage_options#reading-writing-remote-files>
_.engine_kwargs
+(dict, optional)
+— Arbitrary keyword arguments passed to excel engine.to_csv : Write DataFrame to a comma-separated values (csv) file.ExcelWriter : Class for writing DataFrame objects into excel sheets. +read_excel : Read an Excel file into a pandas DataFrame. +read_csv : Read a comma-separated values (csv) file into DataFrame. +io.formats.style.Styler.to_excel : Add styles to Excel sheet.
+Notes
+For compatibility with :meth:~DataFrame.to_csv
,
+to_excel serializes lists and dicts to strings before writing.
Once a workbook has been saved it is not possible to write further +data without rewriting the whole workbook.
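The examples that normally follow did not survive extraction; below is a minimal, standard pandas sketch of the single-sheet and `ExcelWriter` workflows described above (file and sheet names are arbitrary, and an Excel engine such as openpyxl must be installed):

```python
import pandas as pd

df1 = pd.DataFrame([["a", "b"], ["c", "d"]],
                   index=["row 1", "row 2"],
                   columns=["col 1", "col 2"])

# Write a single object: only a target file name is needed
df1.to_excel("output.xlsx", sheet_name="Sheet_name_1")

# Write multiple sheets: create an ExcelWriter and name each sheet
df2 = df1.copy()
with pd.ExcelWriter("output.xlsx") as writer:
    df1.to_excel(writer, sheet_name="Sheet_name_1")
    df2.to_excel(writer, sheet_name="Sheet_name_2")
```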
+to_json
(
path_or_buf=None
, orient=None
, date_format=None
, double_precision=10
, force_ascii=True
, date_unit='ms'
, default_handler=None
, lines=False
, compression='infer'
, index=None
, indent=None
, storage_options=None
, mode='w'
)
Convert the object to a JSON string.
Note NaN's and None will be converted to null and datetime objects +will be converted to UNIX timestamps.
+path_or_buf
+(str, path object, file-like object, or None, default None)
+— String, path object (implementing os.PathLike[str]), or file-likeobject implementing a write() function. If None, the result is
+returned as a string.
+orient
+(str)
+— Indication of expected JSON string format.orient='records'
.date_format
+({None, 'epoch', 'iso'})
+— Type of date conversion. 'epoch' = epoch milliseconds,'iso' = ISO8601. The default depends on the orient
. For
+orient='table'
, the default is 'iso'. For all other orients,
+the default is 'epoch'.
+double_precision
+(int, default 10)
+— The number of decimal places to use when encodingfloating point values. The possible maximal value is 15.
+Passing double_precision greater than 15 will raise a ValueError.
+force_ascii
+(bool, default True)
+— Force encoded string to be ASCII.date_unit
+(str, default 'ms' (milliseconds))
+— The time unit to encode to, governs timestamp and ISO8601precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
+microsecond, and nanosecond respectively.
+default_handler
+(callable, default None)
+— Handler to call if object cannot otherwise be converted to asuitable format for JSON. Should receive a single argument which is
+the object to convert and return a serialisable object.
+lines
+(bool, default False)
+— If 'orient' is 'records' write out line-delimited json format. Willthrow ValueError if incorrect 'orient' since others are not
+list-like.
+compression
+(str or dict, default 'infer')
+— For on-the-fly compression of the output data. If 'infer' and 'path_or_buf' ispath-like, then detect compression from the following extensions: '.gz',
+'.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+(otherwise no compression).
+Set to None
for no compression.
+Can also be a dict with key 'method'
set
+to one of {'zip'
, 'gzip'
, 'bz2'
, 'zstd'
, 'xz'
, 'tar'
} and
+other key-value pairs are forwarded to
+zipfile.ZipFile
, gzip.GzipFile
,
+bz2.BZ2File
, zstandard.ZstdCompressor
, lzma.LZMAFile
or
+tarfile.TarFile
, respectively.
+As an example, the following could be passed for faster compression and to create
+a reproducible gzip archive:
+compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}
..tar
files.index
+(bool or None, default None)
+— The index is only used when 'orient' is 'split', 'index', 'column',or 'table'. Of these, 'index' and 'column' do not support
+index=False
.
+indent
+(int, optional)
+— Length of whitespace used to indent each record.storage_options
+(dict, optional)
+— Extra options that make sense for a particular storage connection, e.g.host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+are forwarded to urllib.request.Request
as header options. For other
+URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+forwarded to fsspec.open
. Please see fsspec
and urllib
for more
+details, and for more examples on storage options refer here
+<https://pandas.pydata.org/docs/user_guide/io.html?
+highlight=storage_options#reading-writing-remote-files>
_.
+mode
+(str, default 'w' (writing))
+— Specify the IO mode for output when supplying a path_or_buf.Accepted args are 'w' (writing) and 'a' (append) only.
+mode='a' is only supported when lines is True and orient is 'records'.
+If path_or_buf is None, returns the resulting json format as astring. Otherwise returns None.
+read_json : Convert a JSON string to pandas object.
Notes
+The behavior of indent=0
varies from the stdlib, which does not
+indent the output but does insert newlines. Currently, indent=0
+and the default indent=None
are equivalent in pandas, though this
+may change in a future release.
orient='table'
contains a 'pandas_version' field under 'schema'.
+This stores the version of pandas
used in the latest revision of the
+schema.
>>> from json import loads, dumps>>> df = pd.DataFrame(
+... [["a", "b"], ["c", "d"]],
+... index=["row 1", "row 2"],
+... columns=["col 1", "col 2"],
+... )
+
>>> result = df.to_json(orient="split")
+>>> parsed = loads(result)
+>>> dumps(parsed, indent=4) # doctest: +SKIP
+{
+ "columns": [
+ "col 1",
+ "col 2"
+ ],
+ "index": [
+ "row 1",
+ "row 2"
+ ],
+ "data": [
+ [
+ "a",
+ "b"
+ ],
+ [
+ "c",
+ "d"
+ ]
+ ]
+}
+
Encoding/decoding a Dataframe using 'records'
formatted JSON.
+Note that index labels are not preserved with this encoding.
>>> result = df.to_json(orient="records")
+>>> parsed = loads(result)
+>>> dumps(parsed, indent=4) # doctest: +SKIP
+[
+ {
+ "col 1": "a",
+ "col 2": "b"
+ },
+ {
+ "col 1": "c",
+ "col 2": "d"
+ }
+]
+
Encoding/decoding a Dataframe using 'index'
formatted JSON:
>>> result = df.to_json(orient="index")
+>>> parsed = loads(result)
+>>> dumps(parsed, indent=4) # doctest: +SKIP
+{
+ "row 1": {
+ "col 1": "a",
+ "col 2": "b"
+ },
+ "row 2": {
+ "col 1": "c",
+ "col 2": "d"
+ }
+}
+
Encoding/decoding a Dataframe using 'columns'
formatted JSON:
>>> result = df.to_json(orient="columns")
+>>> parsed = loads(result)
+>>> dumps(parsed, indent=4) # doctest: +SKIP
+{
+ "col 1": {
+ "row 1": "a",
+ "row 2": "c"
+ },
+ "col 2": {
+ "row 1": "b",
+ "row 2": "d"
+ }
+}
+
Encoding/decoding a Dataframe using 'values'
formatted JSON:
>>> result = df.to_json(orient="values")
+>>> parsed = loads(result)
+>>> dumps(parsed, indent=4) # doctest: +SKIP
+[
+ [
+ "a",
+ "b"
+ ],
+ [
+ "c",
+ "d"
+ ]
+]
+
Encoding with Table Schema:
+>>> result = df.to_json(orient="table")
+>>> parsed = loads(result)
+>>> dumps(parsed, indent=4) # doctest: +SKIP
+{
+ "schema": {
+ "fields": [
+ {
+ "name": "index",
+ "type": "string"
+ },
+ {
+ "name": "col 1",
+ "type": "string"
+ },
+ {
+ "name": "col 2",
+ "type": "string"
+ }
+ ],
+ "primaryKey": [
+ "index"
+ ],
+ "pandas_version": "1.4.0"
+ },
+ "data": [
+ {
+ "index": "row 1",
+ "col 1": "a",
+ "col 2": "b"
+ },
+ {
+ "index": "row 2",
+ "col 1": "c",
+ "col 2": "d"
+ }
+ ]
+}
+
to_hdf
(
path_or_buf
, key
, mode='a'
, complevel=None
, complib=None
, append=False
, format=None
, index=True
, min_itemsize=None
, nan_rep=None
, dropna=None
, data_columns=None
, errors='strict'
, encoding='UTF-8'
)
Write the contained data to an HDF5 file using HDFStore.
Hierarchical Data Format (HDF) is self-describing, allowing an +application to interpret the structure and contents of a file with +no outside information. One HDF file can hold a mix of related objects +which can be accessed as a group or as individual objects.
+In order to add another DataFrame or Series to an existing HDF file +please use append mode and a different key.
+.. warning::
+One can store a subclass of DataFrame
or Series
to HDF5,
+ but the type of the subclass is lost upon storing.
For more information see the :ref:user guide <io.hdf5>
.
path_or_buf
+(str or pandas.HDFStore)
+— File path or HDFStore object.
+key
+(str)
+— Identifier for the group in the store.
+mode
+({'a', 'w', 'r+'}, default 'a')
+— Mode to open file.
+complevel
+({0-9}, default None)
+— Specifies a compression level for data. A value of 0 or None disables compression.
+complib
+({'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib')
+— Specifies the compression library to be used. These additional compressors for Blosc are supported
+(default if no compressor specified: 'blosc:blosclz'):
+{'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
+'blosc:zlib', 'blosc:zstd'}.
+Specifying a compression library which is not available issues
+a ValueError.
+append
+(bool, default False)
+— For Table formats, append the input data to the existing.
+format
+({'fixed', 'table', None}, default 'fixed')
+— Possible values: 'fixed', 'table', or None.
+index
+(bool, default True)
+— Write DataFrame index as a column.
+min_itemsize
+(dict or int, optional)
+— Map column names to minimum string sizes for columns.
+nan_rep
+(Any, optional)
+— How to represent null values as str. Not allowed with append=True.
+dropna
+(bool, default False, optional)
+— Remove missing values.
+data_columns
+(list of columns or True, optional)
+— List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes
+of the object are indexed. See
+:ref:Query via data columns<io.hdf5-query-data-columns>
. for
+more information.
+Applicable only to format='table'.
+errors
+(str, default 'strict')
+— Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:open
for a full list
+of options.
+read_hdf : Read from HDF file.
+DataFrame.to_orc : Write a DataFrame to the binary orc format.
+DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+DataFrame.to_sql : Write to a SQL table.
+DataFrame.to_feather : Write out feather-format for DataFrames.
+DataFrame.to_csv : Write out to a csv file.
+>>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
+... index=['a', 'b', 'c']) # doctest: +SKIP
+>>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP
+
We can add another object to the same file:
+>>> s = pd.Series([1, 2, 3, 4]) # doctest: +SKIP
+>>> s.to_hdf('data.h5', key='s') # doctest: +SKIP
+
Reading from HDF file:
+>>> pd.read_hdf('data.h5', 'df') # doctest: +SKIP
+A B
+a 1 4
+b 2 5
+c 3 6
+>>> pd.read_hdf('data.h5', 's') # doctest: +SKIP
+0 1
+1 2
+2 3
+3 4
+dtype: int64
+
to_sql
(
name
, con
, schema=None
, if_exists='fail'
, index=True
, index_label=None
, chunksize=None
, dtype=None
, method=None
)
Write records stored in a DataFrame to a SQL database.
Databases supported by SQLAlchemy [1]_ are supported. Tables can be +newly created, appended to, or overwritten.
+name
+(str)
+— Name of SQL table.con
+(sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection)
+— Using SQLAlchemy makes it possible to use any DB supported by that library. Legacy support is provided for sqlite3.Connection objects. The user
+is responsible for engine disposal and connection closure for the SQLAlchemy
+connectable. See here <https://docs.sqlalchemy.org/en/20/core/connections.html>
_.
+If passing a sqlalchemy.engine.Connection which is already in a transaction,
+the transaction will not be committed. If passing a sqlite3.Connection,
+it will not be possible to roll back the record insertion.
+schema
+(str, optional)
+— Specify the schema (if database flavor supports this). If None, use default schema.
+if_exists
+({'fail', 'replace', 'append'}, default 'fail')
+— How to behave if the table already exists.
+index
+(bool, default True)
+— Write DataFrame index as a column. Uses index_label
as the column name in the table. Creates a table index for this column.
+index_label
+(str or sequence, default None)
+— Column label for index column(s). If None is given (default) and index
is True, then the index names are used.
+A sequence should be given if the DataFrame uses MultiIndex.
+chunksize
+(int, optional)
+— Specify the number of rows in each batch to be written at a time. By default, all rows will be written at once.
+dtype
+(dict or scalar, optional)
+— Specifying the datatype for columns. If a dictionary is used, the keys should be the column names and the values should be the
+SQLAlchemy types or strings for the sqlite3 legacy mode. If a
+scalar is provided, it will be applied to all columns.
+method
+({None, 'multi', callable}, optional)
+— Controls the SQL insertion clause used:
+None : Uses standard SQL INSERT clause (one per row).
+'multi': Pass multiple values in a single INSERT clause.
+callable with signature (pd_table, conn, keys, data_iter).
+Details and a sample callable implementation can be found in the section insert method <io.sql.method>.
+— Number of rows affected by to_sql. None is returned if the callable passed into method
does not return an integer number of rows.
The number of returned rows affected is the sum of the rowcount
+attribute of sqlite3.Cursor
or SQLAlchemy connectable which may not
+reflect the exact number of written rows as stipulated in the
+sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>
or
+SQLAlchemy <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.CursorResult.rowcount>
.
.. versionadded:: 1.4.0
+ValueError
+
+— When the table already exists and if_exists
is 'fail' (the default).
+read_sql : Read a DataFrame from a table.
Notes
+Timezone aware datetime columns will be written as
+Timestamp with timezone
type with SQLAlchemy if supported by the
+database. Otherwise, the datetimes will be stored as timezone unaware
+timestamps local to the original timezone.
Not all datastores support method="multi"
. Oracle, for example,
+does not support multi-value insert.
+.. [1] https://docs.sqlalchemy.org
+.. [2] https://www.python.org/dev/peps/pep-0249/
+Create an in-memory SQLite database.
>>> from sqlalchemy import create_engine
+>>> engine = create_engine('sqlite://', echo=False)
+
Create a table from scratch with 3 rows.
+>>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})
+>>> df
+ name
+0 User 1
+1 User 2
+2 User 3
+
>>> df.to_sql(name='users', con=engine)
+3
+>>> from sqlalchemy import text
+>>> with engine.connect() as conn:
+... conn.execute(text("SELECT * FROM users")).fetchall()
+[(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]
+
An sqlalchemy.engine.Connection
can also be passed to con
:
>>> with engine.begin() as connection:
+... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})
+... df1.to_sql(name='users', con=connection, if_exists='append')
+2
+
This is allowed to support operations that require that the same +DBAPI connection is used for the entire operation.
+>>> df2 = pd.DataFrame({'name' : ['User 6', 'User 7']})
+>>> df2.to_sql(name='users', con=engine, if_exists='append')
+2
+>>> with engine.connect() as conn:
+... conn.execute(text("SELECT * FROM users")).fetchall()
+[(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
+ (0, 'User 4'), (1, 'User 5'), (0, 'User 6'),
+ (1, 'User 7')]
+
Overwrite the table with just df2
.
>>> df2.to_sql(name='users', con=engine, if_exists='replace',
+... index_label='id')
+2
+>>> with engine.connect() as conn:
+... conn.execute(text("SELECT * FROM users")).fetchall()
+[(0, 'User 6'), (1, 'User 7')]
+
Use method
to define a callable insertion method to do nothing
+if there's a primary key conflict on a table in a PostgreSQL database.
>>> from sqlalchemy.dialects.postgresql import insert
+>>> def insert_on_conflict_nothing(table, conn, keys, data_iter):
+... # "a" is the primary key in "conflict_table"
+... data = [dict(zip(keys, row)) for row in data_iter]
+... stmt = insert(table.table).values(data).on_conflict_do_nothing(index_elements=["a"])
+... result = conn.execute(stmt)
+... return result.rowcount
+>>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", method=insert_on_conflict_nothing) # doctest: +SKIP
+0
+
For MySQL, a callable to update columns b
and c
if there's a conflict
+on a primary key.
>>> from sqlalchemy.dialects.mysql import insert
+>>> def insert_on_conflict_update(table, conn, keys, data_iter):
+... # update columns "b" and "c" on primary key conflict
+... data = [dict(zip(keys, row)) for row in data_iter]
+... stmt = (
+... insert(table.table)
+... .values(data)
+... )
+... stmt = stmt.on_duplicate_key_update(b=stmt.inserted.b, c=stmt.inserted.c)
+... result = conn.execute(stmt)
+... return result.rowcount
+>>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", method=insert_on_conflict_update) # doctest: +SKIP
+2
+
Specify the dtype (especially useful for integers with missing values). +Notice that while pandas is forced to store the data as floating point, +the database supports nullable integers. When fetching the data with +Python, we get back integer scalars.
+>>> df = pd.DataFrame({"A": [1, None, 2]})
+>>> df
+ A
+0 1.0
+1 NaN
+2 2.0
+
>>> from sqlalchemy.types import Integer
+>>> df.to_sql(name='integers', con=engine, index=False,
+... dtype={"A": Integer()})
+3
+
>>> with engine.connect() as conn:
+... conn.execute(text("SELECT * FROM integers")).fetchall()
+[(1,), (None,), (2,)]
+
to_pickle
(
path
, compression='infer'
, protocol=5
, storage_options=None
)
Pickle (serialize) object to file.
path
+(str, path object, or file-like object)
+— String, path object (implementing os.PathLike[str]
), or file-likeobject implementing a binary write()
function. File path where
+the pickled object will be stored.
+compression
+(str or dict, default 'infer')
+— For on-the-fly compression of the output data. If 'infer' and 'path' is path-like, then detect compression from the following extensions: '.gz',
+'.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+(otherwise no compression).
+Set to None
for no compression.
+Can also be a dict with key 'method'
set
+to one of {'zip'
, 'gzip'
, 'bz2'
, 'zstd'
, 'xz'
, 'tar'
} and
+other key-value pairs are forwarded to
+zipfile.ZipFile
, gzip.GzipFile
,
+bz2.BZ2File
, zstandard.ZstdCompressor
, lzma.LZMAFile
or
+tarfile.TarFile
, respectively.
+As an example, the following could be passed for faster compression and to create
+a reproducible gzip archive:
+compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}
.tar files are also supported.
+protocol
+(int)
+— Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
+values are 0, 1, 2, 3, 4, 5. A negative value for the protocol
+parameter is equivalent to setting its value to HIGHEST_PROTOCOL.
+storage_options
+(dict, optional)
+— Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+are forwarded to urllib.request.Request
as header options. For other
+URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+forwarded to fsspec.open
. Please see fsspec
and urllib
for more
+details, and for more examples on storage options refer here
+<https://pandas.pydata.org/docs/user_guide/io.html?
+highlight=storage_options#reading-writing-remote-files>
_.
+read_pickle : Load pickled pandas object (or any object) from file.
+DataFrame.to_hdf : Write DataFrame to an HDF5 file.
+DataFrame.to_sql : Write DataFrame to a SQL database.
+DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+>>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)}) # doctest: +SKIP
+>>> original_df # doctest: +SKIP
+ foo bar
+0 0 5
+1 1 6
+2 2 7
+3 3 8
+4 4 9
+>>> original_df.to_pickle("./dummy.pkl") # doctest: +SKIP
+
>>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
+>>> unpickled_df # doctest: +SKIP
+ foo bar
+0 0 5
+1 1 6
+2 2 7
+3 3 8
+4 4 9
+
to_clipboard
(
excel=True
, sep=None
, **kwargs
)
Copy object to the system clipboard.
Write a text representation of object to the system clipboard. +This can be pasted into Excel, for example.
+excel
+(bool, default True)
+— Produce output in a csv format for easy pasting into excel.
+sep
+(str, default ``'\t'``)
+— Field delimiter.
+**kwargs
+
+— These parameters will be passed to DataFrame.to_csv.
+DataFrame.to_csv : Write a DataFrame to a comma-separated values (csv) file.
+read_clipboard : Read text from clipboard and pass to read_csv.
+Notes
+Requirements for your platform:
+Linux: xclip, or xsel (with PyQt4 modules).
+This method uses the processes developed for the package pyperclip. A
+solution to render any output string format is given in the examples.
Copy the contents of a DataFrame to the clipboard.
>>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
+
>>> df.to_clipboard(sep=',') # doctest: +SKIP
+... # Wrote the following to the system clipboard:
+... # ,A,B,C
+... # 0,1,2,3
+... # 1,4,5,6
+
We can omit the index by passing the keyword index
and setting
+it to false.
>>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP
+... # Wrote the following to the system clipboard:
+... # A,B,C
+... # 1,2,3
+... # 4,5,6
+
Using the original pyperclip
package for any string output format.
.. code-block:: python
+import pyperclip
+html = df.style.to_html()
+pyperclip.copy(html)
+to_xarray
(
)
Return an xarray object from the pandas object.
Data in the pandas structure converted to Dataset if the object is a DataFrame, or a DataArray if the object is a Series.
+DataFrame.to_hdf : Write DataFrame to an HDF5 file.
+DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+Notes
+See the xarray docs <https://xarray.pydata.org/en/stable/>
__
>>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2),
+... ('parrot', 'bird', 24.0, 2),
+... ('lion', 'mammal', 80.5, 4),
+... ('monkey', 'mammal', np.nan, 4)],
+... columns=['name', 'class', 'max_speed',
+... 'num_legs'])
+>>> df
+ name class max_speed num_legs
+0 falcon bird 389.0 2
+1 parrot bird 24.0 2
+2 lion mammal 80.5 4
+3 monkey mammal NaN 4
+
>>> df.to_xarray() # doctest: +SKIP
+<xarray.Dataset>
+Dimensions: (index: 4)
+Coordinates:
+ * index (index) int64 32B 0 1 2 3
+Data variables:
+ name (index) object 32B 'falcon' 'parrot' 'lion' 'monkey'
+ class (index) object 32B 'bird' 'bird' 'mammal' 'mammal'
+ max_speed (index) float64 32B 389.0 24.0 80.5 nan
+ num_legs (index) int64 32B 2 2 4 4
+
>>> df['max_speed'].to_xarray() # doctest: +SKIP
+<xarray.DataArray 'max_speed' (index: 4)>
+array([389. , 24. , 80.5, nan])
+Coordinates:
+ * index (index) int64 0 1 2 3
+
>>> dates = pd.to_datetime(['2018-01-01', '2018-01-01',
+... '2018-01-02', '2018-01-02'])
+>>> df_multiindex = pd.DataFrame({'date': dates,
+... 'animal': ['falcon', 'parrot',
+... 'falcon', 'parrot'],
+... 'speed': [350, 18, 361, 15]})
+>>> df_multiindex = df_multiindex.set_index(['date', 'animal'])
+
>>> df_multiindex
+ speed
+date animal
+2018-01-01 falcon 350
+ parrot 18
+2018-01-02 falcon 361
+ parrot 15
+
>>> df_multiindex.to_xarray() # doctest: +SKIP
+<xarray.Dataset>
+Dimensions: (date: 2, animal: 2)
+Coordinates:
+ * date (date) datetime64[ns] 2018-01-01 2018-01-02
+ * animal (animal) object 'falcon' 'parrot'
+Data variables:
+ speed (date, animal) int64 350 18 361 15
+
to_latex
(
buf=None
, columns=None
, header=True
, index=True
, na_rep='NaN'
, formatters=None
, float_format=None
, sparsify=None
, index_names=True
, bold_rows=False
, column_format=None
, longtable=None
, escape=None
, encoding=None
, decimal='.'
, multicolumn=None
, multicolumn_format=None
, multirow=None
, caption=None
, label=None
, position=None
)
Render object to a LaTeX tabular, longtable, or nested table.
Requires \usepackage{{booktabs}}
. The output can be copy/pasted
+into a main LaTeX document or read from an external file
+with \input{{table.tex}}
.
.. versionchanged:: 2.0.0 + Refactored to use the Styler implementation via jinja2 templating.
+buf
+(str, Path or StringIO-like, optional, default None)
+— Buffer to write to. If None, the output is returned as a string.
+columns
+(list of label, optional)
+— The subset of columns to write. Writes all columns by default.
+header
+(bool or list of str, default True)
+— Write out the column names. If a list of strings is given,
+it is assumed to be aliases for the column names.
+index
+(bool, default True)
+— Write row names (index).
+na_rep
+(str, default 'NaN')
+— Missing data representation.
+formatters
+(list of functions or dict of {{str: function}}, optional)
+— Formatter functions to apply to columns' elements by position or name. The result of each function must be a unicode string.
+List must be of length equal to the number of columns.
+float_format
+(one-parameter function or str, optional, default None)
+— Formatter for floating point numbers. For example float_format="%.2f"
and float_format="{{:0.2f}}".format
will
+both result in 0.1234 being formatted as 0.12.
+sparsify
+(bool, optional)
+— Set to False for a DataFrame with a hierarchical index to print every multiindex key at each row. By default, the value will be
+read from the config module.
+index_names
+(bool, default True)
+— Prints the names of the indexes.
+bold_rows
+(bool, default False)
+— Make the row labels bold in the output.
+column_format
+(str, optional)
+— The columns format as specified in LaTeX table format<https://en.wikibooks.org/wiki/LaTeX/Tables>
__ e.g. 'rcl' for 3
+columns. By default, 'l' will be used for all columns except
+columns of numbers, which default to 'r'.
+longtable
+(bool, optional)
+— Use a longtable environment instead of tabular. Requires adding a \usepackage{{longtable}} to your LaTeX preamble.
+By default, the value will be read from the pandas config
+module, and set to True
if the option styler.latex.environment
is
+"longtable"
.escape
+(bool, optional)
+— By default, the value will be read from the pandas config module and set to True
if the option styler.format.escape
is
+"latex"
. When set to False prevents from escaping latex special
+characters in column names.False
.
+encoding
+(str, optional)
+— A string representing the encoding to use in the output file, defaults to 'utf-8'.
+decimal
+(str, default '.')
+— Character recognized as decimal separator, e.g. ',' in Europe.
+multicolumn
+(bool, default True)
+— Use \multicolumn to enhance MultiIndex columns. The default will be read from the config module, and is set
+as the option styler.sparse.columns
.multicolumn_format
+(str, default 'r')
+— The alignment for multicolumns, similar to column_format
The default will be read from the config module, and is set as the option
+styler.latex.multicol_align
.multirow
+(bool, default True)
+— Use \multirow to enhance MultiIndex rows. Requires adding a\usepackage{{multirow}} to your LaTeX preamble. Will print
+centered labels (instead of top-aligned) across the contained
+rows, separating groups via clines. The default will be read
+from the pandas config module, and is set as the option
+styler.sparse.index
.True
.
+caption
+(str or tuple, optional)
+— Tuple (full_caption, short_caption), which results in \caption[short_caption]{{full_caption}}
;
+if a single string is passed, no short caption will be set.
+label
+(str, optional)
+— The LaTeX label to be placed inside \label{{}}
in the output.This is used with \ref{{}}
in the main .tex
file.
+position
+(str, optional)
+— The LaTeX positional argument for tables, to be placed after\begin{{}}
in the output.
+If buf is None, returns the result as a string. Otherwise returns None.
io.formats.style.Styler.to_latex : Render a DataFrame to LaTeX with conditional formatting. +DataFrame.to_string : Render a DataFrame to a console-friendly + tabular output. +DataFrame.to_html : Render a DataFrame as an HTML table.
+Notes
+As of v2.0.0 this method has changed to use the Styler implementation as
+part of :meth:.Styler.to_latex
via jinja2
templating. This means
+that jinja2
is a requirement, and needs to be installed, for this method
+to function. It is advised that users switch to using Styler, since that
+implementation is more frequently updated and contains much more
+flexibility with the output.
Convert a general DataFrame to LaTeX with formatting:
>>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'],
+... age=[26, 45],
+... height=[181.23, 177.65]))
+>>> print(df.to_latex(index=False,
+... formatters={"name": str.upper},
+... float_format="{:.1f}".format,
+... )) # doctest: +SKIP
+\begin{tabular}{lrr}
+\toprule
+name & age & height \\
+\midrule
+RAPHAEL & 26 & 181.2 \\
+DONATELLO & 45 & 177.7 \\
+\bottomrule
+\end{tabular}
+
to_csv
(
path_or_buf=None
, sep=','
, na_rep=''
, float_format=None
, columns=None
, header=True
, index=True
, index_label=None
, mode='w'
, encoding=None
, compression='infer'
, quoting=None
, quotechar='"'
, lineterminator=None
, chunksize=None
, date_format=None
, doublequote=True
, escapechar=None
, decimal='.'
, errors='strict'
, storage_options=None
)
Write object to a comma-separated values (csv) file.
path_or_buf
+(str, path object, file-like object, or None, default None)
+— String, path object (implementing os.PathLike[str]), or file-like object implementing a write() function. If None, the result is
+returned as a string. If a non-binary file object is passed, it should
+be opened with newline=''
, disabling universal newlines. If a binary
+file object is passed, mode
might need to contain a 'b'
.
+sep
+(str, default ',')
+— String of length 1. Field delimiter for the output file.na_rep
+(str, default '')
+— Missing data representation.float_format
+(str, Callable, default None)
+— Format string for floating point numbers. If a Callable is given, it takes precedence over other numeric formatting parameters, like decimal.
+columns
+(sequence, optional)
+— Columns to write.
+header
+(bool or list of str, default True)
+— Write out the column names. If a list of strings is given it is assumed to be aliases for the column names.
+index
+(bool, default True)
+— Write row names (index).
+index_label
+(str or sequence, or False, default None)
+— Column label for index column(s) if desired. If None is given, and header
and index
are True, then the index names are used. A
+sequence should be given if the object uses MultiIndex. If
+False do not print fields for index names. Use index_label=False
+for easier importing in R.
+mode
+({'w', 'x', 'a'}, default 'w')
+— Forwarded to either open(mode=)
or fsspec.open(mode=)
to control the file opening. Typical values include 'w', 'x' and 'a'.
+encoding
+(str, optional)
+— A string representing the encoding to use in the output file, defaults to 'utf-8'. encoding
is not supported if path_or_buf
+is a non-binary file object.
+compression
+(str or dict, default 'infer')
+— For on-the-fly compression of the output data. If 'infer' and 'path_or_buf' is path-like, then detect compression from the following extensions: '.gz',
+'.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+(otherwise no compression).
+Set to None
for no compression.
+Can also be a dict with key 'method'
set
+to one of {'zip'
, 'gzip'
, 'bz2'
, 'zstd'
, 'xz'
, 'tar'
} and
+other key-value pairs are forwarded to
+zipfile.ZipFile
, gzip.GzipFile
,
+bz2.BZ2File
, zstandard.ZstdCompressor
, lzma.LZMAFile
or
+tarfile.TarFile
, respectively.
+As an example, the following could be passed for faster compression and to create
+a reproducible gzip archive:
+compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}
.tar files are also supported.
+quoting
+(optional constant from csv module)
+— Defaults to csv.QUOTE_MINIMAL. If you have set a float_format
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
+will treat them as non-numeric.
+quotechar
+(str, default '\"')
+— String of length 1. Character used to quote fields.
+lineterminator
+(str, optional)
+— The newline character or character sequence to use in the output file. Defaults to os.linesep
, which depends on the OS in which
+this method is called ('\n' for Linux, '\r\n' for Windows). Previously was line_terminator, changed for consistency with
+read_csv and the standard library 'csv' module.
+
+chunksize
+(int or None)
+— Rows to write at a time.
+date_format
+(str, default None)
+— Format string for datetime objects.
+doublequote
+(bool, default True)
+— Control quoting of quotechar
inside a field.
+escapechar
+(str, default None)
+— String of length 1. Character used to escape sep
and quotechar
when appropriate.
+decimal
+(str, default '.')
+— Character recognized as decimal separator. E.g. use ',' for European data.
+errors
+(str, default 'strict')
+— Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:open
for a full list
+of options.
+storage_options
+(dict, optional)
+— Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+are forwarded to urllib.request.Request
as header options. For other
+URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+forwarded to fsspec.open
. Please see fsspec
and urllib
for more
+details, and for more examples on storage options refer here
+<https://pandas.pydata.org/docs/user_guide/io.html?
+highlight=storage_options#reading-writing-remote-files>
_.
+If path_or_buf is None, returns the resulting csv format as a string. Otherwise returns None.
+read_csv : Load a CSV file into a DataFrame.
+to_excel : Write DataFrame to an Excel file.
+Create 'out.csv' containing 'df' without indices
>>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
+... 'mask': ['red', 'purple'],
+... 'weapon': ['sai', 'bo staff']})
+>>> df.to_csv('out.csv', index=False) # doctest: +SKIP
+
Create 'out.zip' containing 'out.csv'
+>>> df.to_csv(index=False)
+'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
+>>> compression_opts = dict(method='zip',
+... archive_name='out.csv') # doctest: +SKIP
+>>> df.to_csv('out.zip', index=False,
+... compression=compression_opts) # doctest: +SKIP
+
To write a csv file to a new folder or nested folder you will first +need to create it using either Pathlib or os:
+>>> from pathlib import Path # doctest: +SKIP
+>>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
+>>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
+>>> df.to_csv(filepath) # doctest: +SKIP
+
>>> import os # doctest: +SKIP
+>>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
+>>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
+
take
(
indices
, axis=0
, **kwargs
)
Return the elements in the given positional indices along an axis.
This means that we are not indexing according to actual values in +the index attribute of the object. We are indexing according to the +actual position of the element in the object.
+indices
+(array-like)
+— An array of ints indicating which positions to take.
+axis
+({0 or 'index', 1 or 'columns', None}, default 0)
+— The axis on which to select elements. 0
means that we are selecting rows, 1
means that we are selecting columns.
+For Series
this parameter is unused and defaults to 0.
+**kwargs
+
+— For compatibility with :meth:numpy.take
. Has no effect on the output.
+An array-like containing the elements taken from the object.
DataFrame.loc : Select a subset of a DataFrame by labels.
DataFrame.iloc : Select a subset of a DataFrame by positions.
numpy.take : Take elements from an array along an axis.
+>>> df = pd.DataFrame([('falcon', 'bird', 389.0),
+... ('parrot', 'bird', 24.0),
+... ('lion', 'mammal', 80.5),
+... ('monkey', 'mammal', np.nan)],
+... columns=['name', 'class', 'max_speed'],
+... index=[0, 2, 3, 1])
+>>> df
+ name class max_speed
+0 falcon bird 389.0
+2 parrot bird 24.0
+3 lion mammal 80.5
+1 monkey mammal NaN
+
Take elements at positions 0 and 3 along the axis 0 (default).
+Note how the actual indices selected (0 and 1) do not correspond to +our selected indices 0 and 3. That's because we are selecting the 0th +and 3rd rows, not rows whose indices equal 0 and 3.
+>>> df.take([0, 3])
+ name class max_speed
+0 falcon bird 389.0
+1 monkey mammal NaN
+
Take elements at indices 1 and 2 along the axis 1 (column selection).
+>>> df.take([1, 2], axis=1)
+ class max_speed
+0 bird 389.0
+2 bird 24.0
+3 mammal 80.5
+1 mammal NaN
+
We may take elements using negative integers for positive indices, +starting from the end of the object, just like with Python lists.
+>>> df.take([-1, -2])
+ name class max_speed
+1 monkey mammal NaN
+3 lion mammal 80.5
+
xs
(
key
, axis=0
, level=None
, drop_level=True
)
Return cross-section from the Series/DataFrame.
This method takes a key
argument to select data at a particular
+level of a MultiIndex.
key
+(label or tuple of label)
+— Label contained in the index, or partially in a MultiIndex.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— Axis to retrieve cross-section on.
+level
+(object, defaults to first n levels (n=1 or len(key)))
+— In case of a key partially contained in a MultiIndex, indicate which levels are used. Levels can be referred by label or position.
+drop_level
+(bool, default True)
+— If False, returns object with same levels as self.
+Cross-section from the original Series or DataFrame corresponding to the selected index levels.
+DataFrame.loc : Access a group of rows and columns by label(s) or a boolean array. +DataFrame.iloc : Purely integer-location based indexing + for selection by position.
+Notes
+xs
can not be used to set values.
MultiIndex Slicers is a generic way to get/set values on
+any level or levels.
+It is a superset of xs
functionality, see
+:ref:MultiIndex Slicers <advanced.mi_slicers>
.
>>> d = {'num_legs': [4, 4, 2, 2],
+... 'num_wings': [0, 0, 2, 2],
+... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
+... 'animal': ['cat', 'dog', 'bat', 'penguin'],
+... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
+>>> df = pd.DataFrame(data=d)
+>>> df = df.set_index(['class', 'animal', 'locomotion'])
+>>> df
+ num_legs num_wings
+class animal locomotion
+mammal cat walks 4 0
+ dog walks 4 0
+ bat flies 2 2
+bird penguin walks 2 2
+
Get values at specified index
+>>> df.xs('mammal')
+ num_legs num_wings
+animal locomotion
+cat walks 4 0
+dog walks 4 0
+bat flies 2 2
+
Get values at several indexes
+>>> df.xs(('mammal', 'dog', 'walks'))
+num_legs 4
+num_wings 0
+Name: (mammal, dog, walks), dtype: int64
+
Get values at specified index and level
+>>> df.xs('cat', level=1)
+ num_legs num_wings
+class locomotion
+mammal walks 4 0
+
Get values at several indexes and levels
+>>> df.xs(('bird', 'walks'),
+... level=[0, 'locomotion'])
+ num_legs num_wings
+animal
+penguin 2 2
+
Get values at specified column and axis
+>>> df.xs('num_wings', axis=1)
+class animal locomotion
+mammal cat walks 0
+ dog walks 0
+ bat flies 2
+bird penguin walks 2
+Name: num_wings, dtype: int64
+
__delitem__
(
key
)
Delete item
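+A minimal illustrative sketch (not part of the original docstring): del df[key] drops a column in place, which is what __delitem__ implements.
+>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
+>>> del df['B']
+>>> df
+   A
+0  1
+1  2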
get
(
key
, default=None
)
Get item from object for given key (ex: DataFrame column).
Returns default value if not found.
+>>> df = pd.DataFrame(
+... [
+... [24.3, 75.7, "high"],
+... [31, 87.8, "high"],
+... [22, 71.6, "medium"],
+... [35, 95, "medium"],
+... ],
+... columns=["temp_celsius", "temp_fahrenheit", "windspeed"],
+... index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"),
+... )
+
>>> df
+ temp_celsius temp_fahrenheit windspeed
+2014-02-12 24.3 75.7 high
+2014-02-13 31.0 87.8 high
+2014-02-14 22.0 71.6 medium
+2014-02-15 35.0 95.0 medium
+
>>> df.get(["temp_celsius", "windspeed"])
+ temp_celsius windspeed
+2014-02-12 24.3 high
+2014-02-13 31.0 high
+2014-02-14 22.0 medium
+2014-02-15 35.0 medium
+
>>> ser = df['windspeed']
+>>> ser.get('2014-02-13')
+'high'
+
If the key isn't found, the default value will be used.
+>>> df.get(["temp_celsius", "temp_kelvin"], default="default_value")
+'default_value'
+
>>> ser.get('2014-02-10', '[unknown]')
+'[unknown]'
+
reindex_like
(
other
, method=None
, copy=None
, limit=None
, tolerance=None
)
Return an object with matching indices as other object.
Conform the object to the same index on all axes. Optional +filling logic, placing NaN in locations having no value +in the previous index. A new object is produced unless the +new index is equivalent to the current one and copy=False.
+other
+(Object of the same data type)
+— Its row and column indices are used to define the new indices of this object.
+method
+({None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'})
+— Method to use for filling holes in reindexed DataFrame. Please note: this is only applicable to DataFrames/Series with a
+monotonically increasing/decreasing index.
+copy
+(bool, default True)
+— Return a new object, even if the passed indexes are the same. The copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas. You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+limit
+(int, default None)
+— Maximum number of consecutive labels to fill for inexact matches.
+tolerance
+(optional)
+— Maximum distance between original and new labels for inexact matches. The values of the index at the matching locations must
+satisfy the equation abs(index[indexer] - target) <= tolerance
.
+Same type as caller, but with changed indices on each axis.
DataFrame.set_index : Set row labels.
DataFrame.reset_index : Remove row labels or move them to new columns.
DataFrame.reindex : Change to new indices or expand indices.
+Notes
+Same as calling
+.reindex(index=other.index, columns=other.columns,...)
.
>>> df1 = pd.DataFrame([[24.3, 75.7, 'high'],
+... [31, 87.8, 'high'],
+... [22, 71.6, 'medium'],
+... [35, 95, 'medium']],
+... columns=['temp_celsius', 'temp_fahrenheit',
+... 'windspeed'],
+... index=pd.date_range(start='2014-02-12',
+... end='2014-02-15', freq='D'))
+
>>> df1
+ temp_celsius temp_fahrenheit windspeed
+2014-02-12 24.3 75.7 high
+2014-02-13 31.0 87.8 high
+2014-02-14 22.0 71.6 medium
+2014-02-15 35.0 95.0 medium
+
>>> df2 = pd.DataFrame([[28, 'low'],
+... [30, 'low'],
+... [35.1, 'medium']],
+... columns=['temp_celsius', 'windspeed'],
+... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
+... '2014-02-15']))
+
>>> df2
+ temp_celsius windspeed
+2014-02-12 28.0 low
+2014-02-13 30.0 low
+2014-02-15 35.1 medium
+
>>> df2.reindex_like(df1)
+ temp_celsius temp_fahrenheit windspeed
+2014-02-12 28.0 NaN low
+2014-02-13 30.0 NaN low
+2014-02-14 NaN NaN NaN
+2014-02-15 35.1 NaN medium
+
add_prefix
(
prefix
, axis=None
)
Prefix labels with string prefix
.
For Series, the row labels are prefixed. +For DataFrame, the column labels are prefixed.
+prefix
+(str)
+— The string to add before each label.
+axis
+({0 or 'index', 1 or 'columns', None}, default None)
+— Axis to add prefix on.
+New Series or DataFrame with updated labels.
Series.add_suffix: Suffix row labels with string suffix
.
+DataFrame.add_suffix: Suffix column labels with string suffix
.
>>> s = pd.Series([1, 2, 3, 4])
+>>> s
+0 1
+1 2
+2 3
+3 4
+dtype: int64
+
>>> s.add_prefix('item_')
+item_0 1
+item_1 2
+item_2 3
+item_3 4
+dtype: int64
+
>>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
+>>> df
+ A B
+0 1 3
+1 2 4
+2 3 5
+3 4 6
+
>>> df.add_prefix('col_')
+ col_A col_B
+0 1 3
+1 2 4
+2 3 5
+3 4 6
+
add_suffix
(
suffix
, axis=None
)
Suffix labels with string suffix
.
For Series, the row labels are suffixed. +For DataFrame, the column labels are suffixed.
+suffix
+(str)
+— The string to add after each label.
+axis
+({0 or 'index', 1 or 'columns', None}, default None)
+— Axis to add suffix on.
+New Series or DataFrame with updated labels.
Series.add_prefix: Prefix row labels with string prefix
.
+DataFrame.add_prefix: Prefix column labels with string prefix
.
>>> s = pd.Series([1, 2, 3, 4])
+>>> s
+0 1
+1 2
+2 3
+3 4
+dtype: int64
+
>>> s.add_suffix('_item')
+0_item 1
+1_item 2
+2_item 3
+3_item 4
+dtype: int64
+
>>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
+>>> df
+ A B
+0 1 3
+1 2 4
+2 3 5
+3 4 6
+
>>> df.add_suffix('_col')
+ A_col B_col
+0 1 3
+1 2 4
+2 3 5
+3 4 6
+
filter
(
items=None
, like=None
, regex=None
, axis=None
)
Subset the dataframe rows or columns according to the specified index labels.
Note that this routine does not filter a dataframe on its +contents. The filter is applied to the labels of the index.
+items
+(list-like)
+— Keep labels from axis which are in items.
+like
+(str)
+— Keep labels from axis for which "like in label == True".
+regex
+(str (regular expression))
+— Keep labels from axis for which re.search(regex, label) == True.
+axis
+({0 or 'index', 1 or 'columns', None}, default None)
+— The axis to filter on, expressed either as an index (int) or axis name (str). By default this is the info axis, 'columns' for
+DataFrame. For Series
this parameter is unused and defaults to None
.
+DataFrame.loc : Access a group of rows and columns by label(s) or a boolean array.
+Notes
+The items
, like
, and regex
parameters are
+enforced to be mutually exclusive.
axis
defaults to the info axis that is used when indexing
+with []
.
>>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
+... index=['mouse', 'rabbit'],
+... columns=['one', 'two', 'three'])
+>>> df
+ one two three
+mouse 1 2 3
+rabbit 4 5 6
+
>>> # select columns by name
+>>> df.filter(items=['one', 'three'])
+ one three
+mouse 1 3
+rabbit 4 6
+
>>> # select columns by regular expression
+>>> df.filter(regex='e$', axis=1)
+ one three
+mouse 1 3
+rabbit 4 6
+
>>> # select rows containing 'bbi'
+>>> df.filter(like='bbi', axis=0)
+ one two three
+rabbit 4 5 6
+
head
(
n=5
)
Return the first n
rows.
This function returns the first n
rows for the object based
+on position. It is useful for quickly testing if your object
+has the right type of data in it.
For negative values of n
, this function returns all rows except
+the last |n|
rows, equivalent to df[:n]
.
If n is larger than the number of rows, this function returns all rows.
+n
+(int, default 5)
+— Number of rows to select.
+The first n
rows of the caller object.
DataFrame.tail: Returns the last n
rows.
>>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
+... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
+>>> df
+ animal
+0 alligator
+1 bee
+2 falcon
+3 lion
+4 monkey
+5 parrot
+6 shark
+7 whale
+8 zebra
+
Viewing the first 5 lines
+>>> df.head()
+ animal
+0 alligator
+1 bee
+2 falcon
+3 lion
+4 monkey
+
Viewing the first n
lines (three in this case)
>>> df.head(3)
+ animal
+0 alligator
+1 bee
+2 falcon
+
For negative values of n
>>> df.head(-3)
+ animal
+0 alligator
+1 bee
+2 falcon
+3 lion
+4 monkey
+5 parrot
+
tail
(
n=5
)
Return the last n
rows.
This function returns last n
rows from the object based on
+position. It is useful for quickly verifying data, for example,
+after sorting or appending rows.
For negative values of n
, this function returns all rows except
+the first |n|
rows, equivalent to df[|n|:]
.
If n is larger than the number of rows, this function returns all rows.
+n
+(int, default 5)
+— Number of rows to select.
+The last n
rows of the caller object.
DataFrame.head : The first n
rows of the caller object.
>>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
+... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
+>>> df
+ animal
+0 alligator
+1 bee
+2 falcon
+3 lion
+4 monkey
+5 parrot
+6 shark
+7 whale
+8 zebra
+
Viewing the last 5 lines
+>>> df.tail()
+ animal
+4 monkey
+5 parrot
+6 shark
+7 whale
+8 zebra
+
Viewing the last n
lines (three in this case)
>>> df.tail(3)
+ animal
+6 shark
+7 whale
+8 zebra
+
For negative values of n
>>> df.tail(-3)
+ animal
+3 lion
+4 monkey
+5 parrot
+6 shark
+7 whale
+8 zebra
+
sample
(
n=None
, frac=None
, replace=False
, weights=None
, random_state=None
, axis=None
, ignore_index=False
)
Return a random sample of items from an axis of object.
You can use random_state
for reproducibility.
n
+(int, optional)
+— Number of items from axis to return. Cannot be used with frac
.
+Default = 1 if frac
= None.
+frac
+(float, optional)
+— Fraction of axis items to return. Cannot be used with n
.
+replace
+(bool, default False)
+— Allow or disallow sampling of the same row more than once.
+weights
+(str or ndarray-like, optional)
+— Default 'None' results in equal probability weighting. If passed a Series, will align with target object on index. Index
+values in weights not found in sampled object will be ignored and
+index values in sampled object not in weights will be assigned
+weights of zero.
+If called on a DataFrame, will accept the name of a column
+when axis = 0.
+Unless weights are a Series, weights must be same length as axis
+being sampled.
+If weights do not sum to 1, they will be normalized to sum to 1.
+Missing values in the weights column will be treated as zero.
+Infinite values not allowed.
+random_state
+(int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional)
+— If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. np.random.Generator objects now accepted.
+
+axis
+({0 or 'index', 1 or 'columns', None}, default None)
+— Axis to sample. Accepts axis number or name. Default is stat axis for given data type. For Series
this parameter is unused and defaults to None
.
+ignore_index
+(bool, default False)
+— If True, the resulting index will be labeled 0, 1, …, n - 1.
+A new object of same type as caller containing n
items randomly sampled from the caller object.
DataFrameGroupBy.sample: Generates random samples from each group of a DataFrame object. +SeriesGroupBy.sample: Generates random samples from each group of a + Series object. +numpy.random.choice: Generates a random sample from a given 1-D numpy + array.
+Notes
+If frac
> 1, replacement
should be set to True
.
>>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
+... 'num_wings': [2, 0, 0, 0],
+... 'num_specimen_seen': [10, 2, 1, 8]},
+... index=['falcon', 'dog', 'spider', 'fish'])
+>>> df
+ num_legs num_wings num_specimen_seen
+falcon 2 2 10
+dog 4 0 2
+spider 8 0 1
+fish 0 0 8
+
Extract 3 random elements from the Series
df['num_legs']
:
+Note that we use random_state
to ensure the reproducibility of
+the examples.
>>> df['num_legs'].sample(n=3, random_state=1)
+fish 0
+spider 8
+falcon 2
+Name: num_legs, dtype: int64
+
A random 50% sample of the DataFrame
with replacement:
>>> df.sample(frac=0.5, replace=True, random_state=1)
+ num_legs num_wings num_specimen_seen
+dog 4 0 2
+fish 0 0 8
+
An upsample sample of the DataFrame
with replacement:
+Note that replace
parameter has to be True
for frac
parameter > 1.
>>> df.sample(frac=2, replace=True, random_state=1)
+ num_legs num_wings num_specimen_seen
+dog 4 0 2
+fish 0 0 8
+falcon 2 2 10
+falcon 2 2 10
+fish 0 0 8
+dog 4 0 2
+fish 0 0 8
+dog 4 0 2
+
Using a DataFrame column as weights. Rows with larger value in the
+num_specimen_seen
column are more likely to be sampled.
>>> df.sample(n=2, weights='num_specimen_seen', random_state=1)
+ num_legs num_wings num_specimen_seen
+falcon 2 2 10
+fish 0 0 8
+
pipe
(
func
, *args
, **kwargs
)
Apply chainable functions that expect Series or DataFrames.
func
+(function)
+— Function to apply to the Series/DataFrame. args
, and kwargs
are passed into func
.
+Alternatively a (callable, data_keyword)
tuple where
+data_keyword
is a string indicating the keyword of
+callable
that expects the Series/DataFrame.
+*args
+(iterable, optional)
+— Positional arguments passed into func
.
+**kwargs
+(mapping, optional)
+— A dictionary of keyword arguments passed into func
.
+DataFrame.apply : Apply a function along input axis of DataFrame.
+DataFrame.map : Apply a function elementwise on a whole DataFrame.
+Series.map : Apply a mapping correspondence on a
+ :class:~pandas.Series
.
Notes
+Use .pipe
when chaining together functions that expect
+Series, DataFrames or GroupBy objects.
Constructing a income DataFrame from a dictionary.
>>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]]
+>>> df = pd.DataFrame(data, columns=['Salary', 'Others'])
+>>> df
+ Salary Others
+0 8000 1000.0
+1 9500 NaN
+2 5000 2000.0
+
Functions that perform tax reductions on an income DataFrame.
+>>> def subtract_federal_tax(df):
+... return df * 0.9
+>>> def subtract_state_tax(df, rate):
+... return df * (1 - rate)
+>>> def subtract_national_insurance(df, rate, rate_increase):
+... new_rate = rate + rate_increase
+... return df * (1 - new_rate)
+
Instead of writing
+>>> subtract_national_insurance(
+... subtract_state_tax(subtract_federal_tax(df), rate=0.12),
+... rate=0.05,
+... rate_increase=0.02) # doctest: +SKIP
+
You can write
+>>> (
+... df.pipe(subtract_federal_tax)
+... .pipe(subtract_state_tax, rate=0.12)
+... .pipe(subtract_national_insurance, rate=0.05, rate_increase=0.02)
+... )
+ Salary Others
+0 5892.48 736.56
+1 6997.32 NaN
+2 3682.80 1473.12
+
If you have a function that takes the data as (say) the second
+argument, pass a tuple indicating which keyword expects the
+data. For example, suppose national_insurance
takes its data as df
+in the second argument:
>>> def subtract_national_insurance(rate, df, rate_increase):
+... new_rate = rate + rate_increase
+... return df * (1 - new_rate)
+>>> (
+... df.pipe(subtract_federal_tax)
+... .pipe(subtract_state_tax, rate=0.12)
+... .pipe(
+... (subtract_national_insurance, 'df'),
+... rate=0.05,
+... rate_increase=0.02
+... )
+... )
+ Salary Others
+0 5892.48 736.56
+1 6997.32 NaN
+2 3682.80 1473.12
+
__finalize__
(
other
, method=None
, **kwargs
)
Propagate metadata from other to self.
other
+(the object from which to get the attributes that we are going)
+— to propagate
+method
+(str, optional)
+— A passed method name providing context on where __finalize__
was called. Values passed as method
are not currently considered
+ stable across pandas releases.
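+A brief illustrative sketch (not part of the original docstring): __finalize__ is the hook that carries metadata such as DataFrame.attrs from one object to another.
+>>> df = pd.DataFrame({'A': [1, 2]})
+>>> df.attrs['source'] = 'sensor-1'
+>>> df2 = pd.DataFrame({'A': [3, 4]}).__finalize__(df)
+>>> df2.attrs
+{'source': 'sensor-1'}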
+__getattr__
(
name
)
After regular attribute access, try looking up the name. This allows simpler access to columns for interactive use.
+__setattr__
(
name
, value
)
After regular attribute access, try setting the name. This allows simpler access to columns for interactive use.
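+A small illustrative sketch (not part of the original docstring): an existing column can be read as an attribute, while a new column is safest to create with bracket assignment.
+>>> df = pd.DataFrame({'speed': [1, 2, 3]})
+>>> df.speed
+0    1
+1    2
+2    3
+Name: speed, dtype: int64
+>>> df['rank'] = [3, 2, 1]  # bracket assignment adds a new column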
+astype
(
dtype
, copy=None
, errors='raise'
)
Cast a pandas object to a specified dtype dtype
.
dtype
+(str, data type, Series or Mapping of column name -> data type)
+— Use a str, numpy.dtype, pandas.ExtensionDtype or Python type to cast entire pandas object to the same type. Alternatively, use a
+mapping, e.g. {col: dtype, ...}, where col is a column label and dtype is
+a numpy.dtype or Python type to cast one or more of the DataFrame's
+columns to column-specific types.
+copy
+(bool, default True)
+— Return a copy when copy=True
(be very careful settingcopy=False
as changes to values then may propagate to other
+pandas objects).copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas. You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+errors
+({'raise', 'ignore'}, default 'raise')
+— Control raising of exceptions on invalid data for provided dtype.
+raise : allow exceptions to be raised.
+ignore : suppress exceptions. On error return original object.
+to_datetime : Convert argument to datetime.
+to_timedelta : Convert argument to timedelta.
+to_numeric : Convert argument to a numeric type.
+numpy.ndarray.astype : Cast a numpy array to a specified type.
+Notes
+.. versionchanged:: 2.0.0
+Using ``astype`` to convert from timezone-naive dtype to
+timezone-aware dtype will raise an exception.
+Use :meth:`Series.dt.tz_localize` instead.
+
+Create a DataFrame:
>>> d = {'col1': [1, 2], 'col2': [3, 4]}
+>>> df = pd.DataFrame(data=d)
+>>> df.dtypes
+col1 int64
+col2 int64
+dtype: object
+
Cast all columns to int32:
+>>> df.astype('int32').dtypes
+col1 int32
+col2 int32
+dtype: object
+
Cast col1 to int32 using a dictionary:
+>>> df.astype({'col1': 'int32'}).dtypes
+col1 int32
+col2 int64
+dtype: object
+
Create a series:
+>>> ser = pd.Series([1, 2], dtype='int32')
+>>> ser
+0 1
+1 2
+dtype: int32
+>>> ser.astype('int64')
+0 1
+1 2
+dtype: int64
+
Convert to categorical type:
+>>> ser.astype('category')
+0 1
+1 2
+dtype: category
+Categories (2, int32): [1, 2]
+
Convert to ordered categorical type with custom ordering:
+>>> from pandas.api.types import CategoricalDtype
+>>> cat_dtype = CategoricalDtype(
+... categories=[2, 1], ordered=True)
+>>> ser.astype(cat_dtype)
+0 1
+1 2
+dtype: category
+Categories (2, int64): [2 < 1]
+
Create a series of dates:
+>>> ser_date = pd.Series(pd.date_range('20200101', periods=3))
+>>> ser_date
+0 2020-01-01
+1 2020-01-02
+2 2020-01-03
+dtype: datetime64[ns]
+
copy
(
deep=True
)
Make a copy of this object's indices and data.
When deep=True
(default), a new object will be created with a
+copy of the calling object's data and indices. Modifications to
+the data or indices of the copy will not be reflected in the
+original object (see notes below).
When deep=False
, a new object will be created without copying
+the calling object's data or index (only references to the data
+and index are copied). Any changes to the data of the original
+will be reflected in the shallow copy (and vice versa).
.. note::
+ The deep=False
behaviour as described above will change
+ in pandas 3.0. Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that the "shallow" copy
+ is that is returned with deep=False
will still avoid making
+ an eager copy, but changes to the data of the original will no
+ longer be reflected in the shallow copy (or vice versa). Instead,
+ it makes use of a lazy (deferred) copy mechanism that will copy
+ the data only when any changes to the original or shallow copy is
+ made.
You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+deep
+(bool, default True)
+— Make a deep copy, including a copy of the data and the indices.
+With deep=False
neither the indices nor the data are copied.
+Object type matches caller.
Notes
+When deep=True
, data is copied but actual Python objects
+will not be copied recursively, only the reference to the object.
+This is in contrast to copy.deepcopy
in the Standard Library,
+which recursively copies object data (see examples below).
While Index
objects are copied when deep=True
, the underlying
+numpy array is not copied for performance reasons. Since Index
is
+immutable, the underlying data can be safely shared and a copy
+is not needed.
Since pandas is not thread safe, see the
+:ref:gotchas <gotchas.thread-safety>
when copying in a threading
+environment.
When copy_on_write
in pandas config is set to True
, the
+copy_on_write
config takes effect even when deep=False
.
+This means that any changes to the copied data would make a new copy
+of the data upon write (and vice versa). Changes made to either the
+original or copied variable would not be reflected in the counterpart.
+See :ref:Copy_on_Write <copy_on_write>
for more information.
>>> s = pd.Series([1, 2], index=["a", "b"])
+>>> s
+a 1
+b 2
+dtype: int64
+
>>> s_copy = s.copy()
+>>> s_copy
+a 1
+b 2
+dtype: int64
+
Shallow copy versus default (deep) copy:
+>>> s = pd.Series([1, 2], index=["a", "b"])
+>>> deep = s.copy()
+>>> shallow = s.copy(deep=False)
+
Shallow copy shares data and index with original.
+>>> s is shallow
+False
+>>> s.values is shallow.values and s.index is shallow.index
+True
+
Deep copy has own copy of data and index.
+>>> s is deep
+False
+>>> s.values is deep.values or s.index is deep.index
+False
+
Updates to the data shared by shallow copy and original are reflected +in both (NOTE: this will no longer be true for pandas >= 3.0); +deep copy remains unchanged.
+>>> s.iloc[0] = 3
+>>> shallow.iloc[1] = 4
+>>> s
+a 3
+b 4
+dtype: int64
+>>> shallow
+a 3
+b 4
+dtype: int64
+>>> deep
+a 1
+b 2
+dtype: int64
+
Note that when copying an object containing Python objects, a deep copy +will copy the data, but will not do so recursively. Updating a nested +data object will be reflected in the deep copy.
+>>> s = pd.Series([[1, 2], [3, 4]])
+>>> deep = s.copy()
+>>> s[0][0] = 10
+>>> s
+0 [10, 2]
+1 [3, 4]
+dtype: object
+>>> deep
+0 [10, 2]
+1 [3, 4]
+dtype: object
+
When Copy-on-Write is set to true, the shallow copy is not modified +when the original data is changed:
+>>> with pd.option_context("mode.copy_on_write", True):
+... s = pd.Series([1, 2], index=["a", "b"])
+... copy = s.copy(deep=False)
+... s.iloc[0] = 100
+... s
+a 100
+b 2
+dtype: int64
+>>> copy
+a 1
+b 2
+dtype: int64
+
__deepcopy__
(
memo=None
)
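+A minimal illustrative sketch (not part of the original docstring): __deepcopy__ backs copy.deepcopy, which behaves like .copy(deep=True).
+>>> import copy
+>>> s = pd.Series([1, 2])
+>>> s2 = copy.deepcopy(s)
+>>> s2.iloc[0] = 99
+>>> s
+0    1
+1    2
+dtype: int64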
infer_objects
(
copy=None
)
Attempt to infer better dtypes for object columns.
Attempts soft conversion of object-dtyped +columns, leaving non-object and unconvertible +columns unchanged. The inference rules are the +same as during normal Series/DataFrame construction.
+copy
+(bool, default True)
+— Whether to make a copy for non-object or non-inferable columns or Series. The copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas. You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+to_datetime : Convert argument to datetime.
+to_timedelta : Convert argument to timedelta.
+to_numeric : Convert argument to numeric type.
+convert_dtypes : Convert argument to best possible dtype.
>>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
+>>> df = df.iloc[1:]
+>>> df
+ A
+1 1
+2 2
+3 3
+
>>> df.dtypes
+A object
+dtype: object
+
>>> df.infer_objects().dtypes
+A int64
+dtype: object
+
convert_dtypes
(
infer_objects=True
, convert_string=True
, convert_integer=True
, convert_boolean=True
, convert_floating=True
, dtype_backend='numpy_nullable'
)
Convert columns to the best possible dtypes using dtypes supporting pd.NA
.
infer_objects
+(bool, default True)
+— Whether object dtypes should be converted to the best possible types.
+convert_string
+(bool, default True)
+— Whether object dtypes should be converted to StringDtype()
.
+convert_integer
+(bool, default True)
+— Whether, if possible, conversion can be done to integer extension types.
+convert_boolean
+(bool, defaults True)
+— Whether object dtypes should be converted to BooleanDtypes()
.convert_floating
+(bool, defaults True)
+— Whether, if possible, conversion can be done to floating extension types.If convert_integer
is also True, preference will be given to integer
+dtypes if the floats can be faithfully cast to integers.
+dtype_backend
+({'numpy_nullable', 'pyarrow'}, default 'numpy_nullable')
+— Back-end data type applied to the resultant :class:DataFrame
(still experimental). Behaviour is as follows:"numpy_nullable"
: returns nullable-dtype-backed :class:DataFrame
+ (default)."pyarrow"
: returns pyarrow-backed nullable :class:ArrowDtype
+ DataFrame.Copy of input object with new dtype.
infer_objects : Infer dtypes of objects.
+to_datetime : Convert argument to datetime.
+to_timedelta : Convert argument to timedelta.
+to_numeric : Convert argument to a numeric type.
+Notes
+By default, convert_dtypes
will attempt to convert a Series (or each
+Series in a DataFrame) to dtypes that support pd.NA
. By using the options
+convert_string
, convert_integer
, convert_boolean
and
+convert_floating
, it is possible to turn off individual conversions
+to StringDtype
, the integer extension types, BooleanDtype
+or floating extension types, respectively.
For object-dtyped columns, if infer_objects
is True
, use the inference
+rules as during normal Series/DataFrame construction. Then, if possible,
+convert to StringDtype
, BooleanDtype
or an appropriate integer
+or floating extension type, otherwise leave as object
.
If the dtype is integer, convert to an appropriate integer extension type.
+If the dtype is numeric, and consists of all integers, convert to an +appropriate integer extension type. Otherwise, convert to an +appropriate floating extension type.
+In the future, as new dtypes are added that support pd.NA
, the results
+of this method will change to support those new dtypes.
>>> df = pd.DataFrame(
+...     {
+... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
+... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
+... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
+... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
+... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
+... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
+... }
+... )
+
Start with a DataFrame with default dtypes.
+>>> df
+ a b c d e f
+0 1 x True h 10.0 NaN
+1 2 y False i NaN 100.5
+2 3 z NaN NaN 20.0 200.0
+
>>> df.dtypes
+a int32
+b object
+c object
+d object
+e float64
+f float64
+dtype: object
+
Convert the DataFrame to use best possible dtypes.
+>>> dfn = df.convert_dtypes()
+>>> dfn
+ a b c d e f
+0 1 x True h 10 <NA>
+1 2 y False i <NA> 100.5
+2 3 z <NA> <NA> 20 200.0
+
>>> dfn.dtypes
+a Int32
+b string[python]
+c boolean
+d string[python]
+e Int64
+f Float64
+dtype: object
+
Start with a Series of strings and missing data represented by np.nan
.
>>> s = pd.Series(["a", "b", np.nan])
+>>> s
+0 a
+1 b
+2 NaN
+dtype: object
+
Obtain a Series with dtype StringDtype
.
>>> s.convert_dtypes()
+0 a
+1 b
+2 <NA>
+dtype: string
+
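+The dtype_backend option can also be exercised directly. A minimal sketch (not part of the
+original docstring; the data here is illustrative and pyarrow must be installed for the
+second call):
+>>> df = pd.DataFrame({"a": [1, 2, None], "b": ["x", None, "z"]})
+>>> df.convert_dtypes().dtypes                         # nullable NumPy-backed dtypes (default)
+>>> df.convert_dtypes(dtype_backend="pyarrow").dtypes  # ArrowDtype-backed columns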
fillna
(
value=None
, method=None
, axis=None
, inplace=False
, limit=None
, downcast=<no_default>
)
Fill NA/NaN values using the specified method.
value
+(scalar, dict, Series, or DataFrame)
+— Value to use to fill holes (e.g. 0), alternately adict/Series/DataFrame of values specifying which value to use for
+each index (for a Series) or column (for a DataFrame). Values not
+in the dict/Series/DataFrame will not be filled. This value cannot
+be a list.
+method
+({'backfill', 'bfill', 'ffill', None}, default None)
+— Method to use for filling holes in reindexed Series:axis
+({0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame)
+— Axis along which to fill missing values. For Series
this parameter is unused and defaults to 0.
+inplace
+(bool, default False)
+— If True, fill in-place. Note: this will modify anyother views on this object (e.g., a no-copy slice for a column in a
+DataFrame).
+limit
+(int, default None)
+— If method is specified, this is the maximum number of consecutiveNaN values to forward/backward fill. In other words, if there is
+a gap with more than this number of consecutive NaNs, it will only
+be partially filled. If method is not specified, this is the
+maximum number of entries along the entire axis where NaNs will be
+filled. Must be greater than 0 if not None.
+downcast
+(dict, default is None)
+— A dict of item->dtype of what to downcast if possible,or the string 'infer' which will try to downcast to an appropriate
+equal type (e.g. float64 to int64 if possible).Object with missing values filled or None if inplace=True
.
ffill : Fill values by propagating the last valid observation to next valid.
+bfill : Fill values by using the next valid observation to fill the gap.
+interpolate : Fill NaN values using interpolation.
+reindex : Conform object to new index.
+asfreq : Convert TimeSeries to specified frequency.
+>>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
+...                    [3, 4, np.nan, 1],
+... [np.nan, np.nan, np.nan, np.nan],
+... [np.nan, 3, np.nan, 4]],
+... columns=list("ABCD"))
+>>> df
+ A B C D
+0 NaN 2.0 NaN 0.0
+1 3.0 4.0 NaN 1.0
+2 NaN NaN NaN NaN
+3 NaN 3.0 NaN 4.0
+
Replace all NaN elements with 0s.
+>>> df.fillna(0)
+ A B C D
+0 0.0 2.0 0.0 0.0
+1 3.0 4.0 0.0 1.0
+2 0.0 0.0 0.0 0.0
+3 0.0 3.0 0.0 4.0
+
Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1, +2, and 3 respectively.
+>>> values = {"A": 0, "B": 1, "C": 2, "D": 3}
+>>> df.fillna(value=values)
+ A B C D
+0 0.0 2.0 2.0 0.0
+1 3.0 4.0 2.0 1.0
+2 0.0 1.0 2.0 3.0
+3 0.0 3.0 2.0 4.0
+
Only replace the first NaN element.
+>>> df.fillna(value=values, limit=1)
+ A B C D
+0 0.0 2.0 2.0 0.0
+1 3.0 4.0 NaN 1.0
+2 NaN 1.0 NaN 3.0
+3 NaN 3.0 NaN 4.0
+
When filling using a DataFrame, replacement happens along +the same column names and same indices
+>>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
+>>> df.fillna(df2)
+ A B C D
+0 0.0 2.0 0.0 0.0
+1 3.0 4.0 0.0 1.0
+2 0.0 0.0 0.0 NaN
+3 0.0 3.0 0.0 4.0
+
Note that column D is not affected since it is not present in df2.
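+A common variation on the dict example above is to fill each numeric column with its own
+statistic. A brief sketch (not part of the original examples), reusing the df defined above:
+>>> df.fillna(df.mean(numeric_only=True))  # per-column means; column C stays NaN since its mean is NaN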
+ffill
(
axis=None
, inplace=False
, limit=None
, limit_area=None
, downcast=<no_default>
)
Fill NA/NaN values by propagating the last valid observation to next valid.
axis
+({0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame)
+— Axis along which to fill missing values. For Series
this parameter is unused and defaults to 0.
+inplace
+(bool, default False)
+— If True, fill in-place. Note: this will modify anyother views on this object (e.g., a no-copy slice for a column in a
+DataFrame).
+limit
+(int, default None)
+— If method is specified, this is the maximum number of consecutiveNaN values to forward/backward fill. In other words, if there is
+a gap with more than this number of consecutive NaNs, it will only
+be partially filled. If method is not specified, this is the
+maximum number of entries along the entire axis where NaNs will be
+filled. Must be greater than 0 if not None.
+limit_area
+({`None`, 'inside', 'outside'}, default None)
+— If limit is specified, consecutive NaNs will be filled with thisrestriction.None
: No fill restriction.downcast
+(dict, default is None)
+— A dict of item->dtype of what to downcast if possible,or the string 'infer' which will try to downcast to an appropriate
+equal type (e.g. float64 to int64 if possible).Object with missing values filled or None if inplace=True
.
>>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
+...                    [3, 4, np.nan, 1],
+... [np.nan, np.nan, np.nan, np.nan],
+... [np.nan, 3, np.nan, 4]],
+... columns=list("ABCD"))
+>>> df
+ A B C D
+0 NaN 2.0 NaN 0.0
+1 3.0 4.0 NaN 1.0
+2 NaN NaN NaN NaN
+3 NaN 3.0 NaN 4.0
+
>>> df.ffill()
+ A B C D
+0 NaN 2.0 NaN 0.0
+1 3.0 4.0 NaN 1.0
+2 3.0 4.0 NaN 1.0
+3 3.0 3.0 NaN 4.0
+
>>> ser = pd.Series([1, np.nan, 2, 3])
+>>> ser.ffill()
+0 1.0
+1 1.0
+2 2.0
+3 3.0
+dtype: float64
+
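+The limit_area restriction described above can be combined with ffill. A minimal sketch
+(not part of the original examples; the Series is illustrative):
+>>> ser = pd.Series([np.nan, 1, np.nan, np.nan, 3, np.nan])
+>>> ser.ffill(limit_area="inside")   # fills only the NaNs surrounded by valid values
+>>> ser.ffill(limit_area="outside")  # fills only NaNs outside valid values (here the trailing one)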
pad
(
axis=None
, inplace=False
, limit=None
, downcast=<no_default>
)
Fill NA/NaN values by propagating the last valid observation to next valid.
.. deprecated:: 2.0
+Series/DataFrame.pad is deprecated. Use Series/DataFrame.ffill instead.
+
+Object with missing values filled or None if inplace=True
.
Please see examples for :meth:DataFrame.ffill
or :meth:Series.ffill
.
bfill
(
axis=None
, inplace=False
, limit=None
, limit_area=None
, downcast=<no_default>
)
Fill NA/NaN values by using the next valid observation to fill the gap.
axis
+({0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame)
+— Axis along which to fill missing values. For Series
this parameter is unused and defaults to 0.
+inplace
+(bool, default False)
+— If True, fill in-place. Note: this will modify anyother views on this object (e.g., a no-copy slice for a column in a
+DataFrame).
+limit
+(int, default None)
+— If method is specified, this is the maximum number of consecutiveNaN values to forward/backward fill. In other words, if there is
+a gap with more than this number of consecutive NaNs, it will only
+be partially filled. If method is not specified, this is the
+maximum number of entries along the entire axis where NaNs will be
+filled. Must be greater than 0 if not None.
+limit_area
+({`None`, 'inside', 'outside'}, default None)
+— If limit is specified, consecutive NaNs will be filled with thisrestriction.None
: No fill restriction.downcast
+(dict, default is None)
+— A dict of item->dtype of what to downcast if possible,or the string 'infer' which will try to downcast to an appropriate
+equal type (e.g. float64 to int64 if possible).Object with missing values filled or None if inplace=True
.
For Series:
>>> s = pd.Series([1, None, None, 2])
+>>> s.bfill()
+0 1.0
+1 2.0
+2 2.0
+3 2.0
+dtype: float64
+>>> s.bfill(limit=1)
+0 1.0
+1 NaN
+2 2.0
+3 2.0
+dtype: float64
+
With DataFrame:
+>>> df = pd.DataFrame({'A': [1, None, None, 4], 'B': [None, 5, None, 7]})
+>>> df
+ A B
+0 1.0 NaN
+1 NaN 5.0
+2 NaN NaN
+3 4.0 7.0
+>>> df.bfill()
+ A B
+0 1.0 5.0
+1 4.0 5.0
+2 4.0 7.0
+3 4.0 7.0
+>>> df.bfill(limit=1)
+ A B
+0 1.0 5.0
+1 NaN 5.0
+2 4.0 7.0
+3 4.0 7.0
+
backfill
(
axis=None
, inplace=False
, limit=None
, downcast=<no_default>
)
Fill NA/NaN values by using the next valid observation to fill the gap.
.. deprecated:: 2.0
+Series/DataFrame.backfill is deprecated. Use Series/DataFrame.bfill instead.
+
+Object with missing values filled or None if inplace=True
.
Please see examples for :meth:DataFrame.bfill
or :meth:Series.bfill
.
replace
(
to_replace=None
, value=<no_default>
, inplace=False
, limit=None
, regex=False
, method=<no_default>
)
Replace values given in to_replace
with value
.
Values of the Series/DataFrame are replaced with other values dynamically.
+This differs from updating with .loc
or .iloc
, which require
+you to specify a location to update with some value.
to_replace
+(str, regex, list, dict, Series, int, float, or None)
+— How to find the values that will be replaced.to_replace
will be
+ replaced with value
to_replace
will be replaced
+ with value
to_replace
will be replaced with
+ value
to_replace
and value
are both lists, they
+ must be the same length.regex=True
then all of the strings in both
+ lists will be interpreted as regexs otherwise they will match
+ directly. This doesn't matter much for value
since there
+ are only a few possible substitution regexes you can use.{'a': 'b', 'y': 'z'}
replaces the value 'a' with 'b' and
+ 'y' with 'z'. To use a dict in this way, the optional value
+ parameter should not be given.{'a': 1, 'b': 'z'}
looks for the value 1 in column 'a'
+ and the value 'z' in column 'b' and replaces these values
+ with whatever is specified in value
. The value
parameter
+ should not be None
in this case. You can treat this as a
+ special case of passing two lists except that you are
+ specifying the column to search in.{'a': {'b': np.nan}}
, are read as follows: look in column
+ 'a' for the value 'b' and replace it with NaN. The optional value
+ parameter should not be specified to use a nested dict in this
+ way. You can nest regular expressions as well. Note that
+ column names (the top-level dictionary keys in a nested
+ dictionary) cannot be regular expressions.regex
argument must be a string,
+ compiled regular expression, or list, dict, ndarray or
+ Series of such elements. If value
is also None
then
+ this must be a nested dictionary or Series.value
+(scalar, dict, list, str, regex, default None)
+— Value to replace any values matching to_replace
with.For a DataFrame a dict of values can be used to specify which
+value to use for each column (columns not in the dict will not be
+filled). Regular expressions, strings and lists or dicts of such
+objects are also allowed.
+inplace
+(bool, default False)
+— If True, performs operation inplace and returns None.limit
+(int, default None)
+— Maximum size gap to forward or backward fill.regex
+(bool or same types as `to_replace`, default False)
+— Whether to interpret to_replace
and/or value
as regularexpressions. Alternatively, this could be a regular expression or a
+list, dict, or array of regular expressions in which case
+to_replace
must be None
.
+method
+({'pad', 'ffill', 'bfill'})
+— The method to use when for replacement, when to_replace
is ascalar, list or tuple and value
is None
.Object after replacement.
AssertionError
+
+— regex
is not a bool
and to_replace
is not
+ None
.TypeError
+
+— to_replace
is not a scalar, array-like, dict
, or None
to_replace
is a dict
and value
is not a list
,
+ dict
, ndarray
, or Series
to_replace
is None
and regex
is not compilable
+ into a regular expression or is a list, dict, ndarray, or
+ Series.bool
or datetime64
objects and
+ the arguments to to_replace
does not match the type of the
+ value being replacedValueError
+
+— list
or an ndarray
is passed to to_replace
and
+ value
but they are not the same length.
+Series.fillna : Fill NA values.
+DataFrame.fillna : Fill NA values.
+Series.where : Replace values based on boolean condition.
+DataFrame.where : Replace values based on boolean condition.
+DataFrame.map : Apply a function to a Dataframe elementwise.
+Series.map : Map values of Series according to an input mapping or function.
+Series.str.replace : Simple string replacement.
+Notes
+Regex substitution is performed under the hood with re.sub. The
+rules for substitution for re.sub are the same.
+When dict is used as the to_replace value, it is like
+key(s) in the dict are the to_replace part and
+value(s) in the dict are the value parameter.
Scalar to_replace and value:
+>>> s = pd.Series([1, 2, 3, 4, 5])
+>>> s.replace(1, 5)
+0    5
+1    2
+2    3
+3    4
+4    5
+dtype: int64
+>>> df = pd.DataFrame({'A': [0, 1, 2, 3, 4],
+...                    'B': [5, 6, 7, 8, 9],
+...                    'C': ['a', 'b', 'c', 'd', 'e']})
+>>> df.replace(0, 5)
+   A  B  C
+0  5  5  a
+1  1  6  b
+2  2  7  c
+3  3  8  d
+4  4  9  e
+
List-like to_replace:
+>>> df.replace([0, 1, 2, 3], 4)
+   A  B  C
+0  4  5  a
+1  4  6  b
+2  4  7  c
+3  4  8  d
+4  4  9  e
+
dict-like to_replace:
+>>> df.replace({0: 10, 1: 100})
+     A  B  C
+0   10  5  a
+1  100  6  b
+2    2  7  c
+3    3  8  d
+4    4  9  e
+
Regular-expression to_replace:
+>>> df = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
+...                    'B': ['abc', 'bar', 'xyz']})
+>>> df.replace(to_replace=r'^ba.$', value='new', regex=True)
+      A    B
+0   new  abc
+1   foo  new
+2  bait  xyz
+
+interpolate
(
method='linear'
, axis=0
, limit=None
, inplace=False
, limit_direction=None
, limit_area=None
, downcast=<no_default>
, **kwargs
)
Fill NaN values using an interpolation method.
Please note that only method='linear'
is supported for
+DataFrame/Series with a MultiIndex.
method
+(str, default 'linear')
+— Interpolation technique to use. One of:scipy.interpolate.interp1d
, whereas 'spline' is passed to
+ scipy.interpolate.UnivariateSpline
. These methods use the numerical
+ values of the index. Both 'polynomial' and 'spline' require that
+ you also specify an order
(int), e.g.
+ df.interpolate(method='polynomial', order=5)
. Note that,
+ slinear
method in Pandas refers to the Scipy first order spline
+ instead of Pandas first order spline
.Notes
.scipy.interpolate.BPoly.from_derivatives
.axis
+({{0 or 'index', 1 or 'columns', None}}, default None)
+— Axis to interpolate along. For Series
this parameter is unusedand defaults to 0.
+limit
+(int, optional)
+— Maximum number of consecutive NaNs to fill. Must be greater than0.
+inplace
+(bool, default False)
+— Update the data in place if possible.limit_direction
+({{'forward', 'backward', 'both'}}, Optional)
+— Consecutive NaNs will be filled in this direction.limit_direction
is 'forward' or 'both' and
+ method is 'backfill' or 'bfill'.
+raises ValueError if limit_direction
is 'backward' or 'both' and
+ method is 'pad' or 'ffill'.
+limit_area
+({{`None`, 'inside', 'outside'}}, default None)
+— If limit is specified, consecutive NaNs will be filled with thisrestriction.None
: No fill restriction.downcast
+(optional, 'infer' or None, defaults to None)
+— Downcast dtypes if possible.Returns the same object type as the caller, interpolated atsome or all NaN
values or None if inplace=True
.
fillna : Fill missing values using different methods.
+scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials
+    (Akima interpolator).
+scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the
+    Bernstein basis.
+scipy.interpolate.interp1d : Interpolate a 1-D function.
+scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh
+    interpolator).
+scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic
+    interpolation.
+scipy.interpolate.CubicSpline : Cubic spline data interpolator.
+Notes
+The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima'
+methods are wrappers around the respective SciPy implementations of
+similar names. These use the actual numerical values of the index.
+For more information on their behavior, see the
+SciPy documentation
+<https://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation>
__.
Filling in NaN
in a :class:~pandas.Series
via linearinterpolation.
>>> s = pd.Series([0, 1, np.nan, 3])
+>>> s
+0 0.0
+1 1.0
+2 NaN
+3 3.0
+dtype: float64
+>>> s.interpolate()
+0 0.0
+1 1.0
+2 2.0
+3 3.0
+dtype: float64
+
Filling in NaN
in a Series via polynomial interpolation or splines:
+Both 'polynomial' and 'spline' methods require that you also specify
+an order
(int).
>>> s = pd.Series([0, 2, np.nan, 8])
+>>> s.interpolate(method='polynomial', order=2)
+0 0.000000
+1 2.000000
+2 4.666667
+3 8.000000
+dtype: float64
+
Fill the DataFrame forward (that is, going down) along each column +using linear interpolation.
+Note how the last entry in column 'a' is interpolated differently,
+because there is no entry after it to use for interpolation.
+Note how the first entry in column 'b' remains NaN
, because there
+is no entry before it to use for interpolation.
>>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0),
+... (np.nan, 2.0, np.nan, np.nan),
+... (2.0, 3.0, np.nan, 9.0),
+... (np.nan, 4.0, -4.0, 16.0)],
+... columns=list('abcd'))
+>>> df
+ a b c d
+0 0.0 NaN -1.0 1.0
+1 NaN 2.0 NaN NaN
+2 2.0 3.0 NaN 9.0
+3 NaN 4.0 -4.0 16.0
+>>> df.interpolate(method='linear', limit_direction='forward', axis=0)
+ a b c d
+0 0.0 NaN -1.0 1.0
+1 1.0 2.0 -2.0 5.0
+2 2.0 3.0 -3.0 9.0
+3 2.0 4.0 -4.0 16.0
+
Using polynomial interpolation.
+>>> df['d'].interpolate(method='polynomial', order=2)
+0 1.0
+1 4.0
+2 9.0
+3 16.0
+Name: d, dtype: float64
+
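+The limit_direction and limit_area options can be combined with the methods shown above.
+A brief sketch (not part of the original examples; the Series is illustrative):
+>>> s = pd.Series([np.nan, 1, np.nan, 3, np.nan])
+>>> s.interpolate(limit_direction="both")  # also back-fills the leading NaN
+>>> s.interpolate(limit_area="inside")     # fills only the NaN between valid values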
asof
(
where
, subset=None
)
Return the last row(s) without any NaNs before where
.
The last row (for each element in where
, if list) without any
+NaN is taken.
+In case of a :class:~pandas.DataFrame
, the last row without NaN
+considering only the subset of columns (if not None
)
If there is no good value, NaN is returned for a Series or +a Series of NaN values for a DataFrame
+where
+(date or array-like of dates)
+— Date(s) before which the last row(s) are returned.subset
+(str or array-like of str, default `None`)
+— For DataFrame, if not None
, only use these columns tocheck for NaNs.
+scalar, Series, or DataFrame
+— The return can be: a scalar when self is a Series and where is a scalar;
+a Series when self is a Series and where is an array-like, or when self is a DataFrame and where is a scalar;
+a DataFrame when self is a DataFrame and where is an array-like.
+merge_asof : Perform an asof merge. Similar to left join.
Notes
+Dates are assumed to be sorted. Raises if this is not the case.
+A Series and a scalar where
.
>>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])
+>>> s
+10 1.0
+20 2.0
+30 NaN
+40 4.0
+dtype: float64
+
>>> s.asof(20)
+2.0
+
For a sequence where
, a Series is returned. The first value is
+NaN, because the first element of where
is before the first
+index value.
>>> s.asof([5, 20])
+5 NaN
+20 2.0
+dtype: float64
+
Missing values are not considered. The following is 2.0
, not
+NaN, even though NaN is at the index location for 30
.
>>> s.asof(30)
+2.0
+
Take all columns into consideration
+>>> df = pd.DataFrame({'a': [10., 20., 30., 40., 50.],
+... 'b': [None, None, None, None, 500]},
+... index=pd.DatetimeIndex(['2018-02-27 09:01:00',
+... '2018-02-27 09:02:00',
+... '2018-02-27 09:03:00',
+... '2018-02-27 09:04:00',
+... '2018-02-27 09:05:00']))
+>>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
+... '2018-02-27 09:04:30']))
+ a b
+2018-02-27 09:03:30 NaN NaN
+2018-02-27 09:04:30 NaN NaN
+
Take a single column into consideration
+>>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
+... '2018-02-27 09:04:30']),
+... subset=['a'])
+ a b
+2018-02-27 09:03:30 30.0 NaN
+2018-02-27 09:04:30 40.0 NaN
+
clip
(
lower=None
, upper=None
, axis=None
, inplace=False
, **kwargs
)
Trim values at input threshold(s).
Assigns values outside boundary to boundary values. Thresholds +can be singular values or array like, and in the latter case +the clipping is performed element-wise in the specified axis.
+lower
+(float or array-like, default None)
+— Minimum threshold value. All values below thisthreshold will be set to it. A missing
+threshold (e.g NA
) will not clip the value.
+upper
+(float or array-like, default None)
+— Maximum threshold value. All values above thisthreshold will be set to it. A missing
+threshold (e.g NA
) will not clip the value.
+axis
+({{0 or 'index', 1 or 'columns', None}}, default None)
+— Align object with lower and upper along the given axis.For Series
this parameter is unused and defaults to None
.
+inplace
+(bool, default False)
+— Whether to perform the operation in place on the data.Same type as calling object with the values outside theclip boundaries replaced or None if inplace=True
.
Series.clip : Trim values at input threshold in series.
+DataFrame.clip : Trim values at input threshold in dataframe.
+numpy.clip : Clip (limit) the values in an array.
+>>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
+>>> df = pd.DataFrame(data)
+>>> df
+ col_0 col_1
+0 9 -2
+1 -3 -7
+2 0 6
+3 -1 8
+4 5 -5
+
Clips per column using lower and upper thresholds:
+>>> df.clip(-4, 6)
+ col_0 col_1
+0 6 -2
+1 -3 -4
+2 0 6
+3 -1 6
+4 5 -4
+
Clips using specific lower and upper thresholds per column:
+>>> df.clip([-2, -1], [4, 5])
+ col_0 col_1
+0 4 -1
+1 -2 -1
+2 0 5
+3 -1 5
+4 4 -1
+
Clips using specific lower and upper thresholds per column element:
+>>> t = pd.Series([2, -4, -1, 6, 3])
+>>> t
+0 2
+1 -4
+2 -1
+3 6
+4 3
+dtype: int64
+
>>> df.clip(t, t + 4, axis=0)
+ col_0 col_1
+0 6 2
+1 -3 -4
+2 0 3
+3 6 8
+4 5 3
+
Clips using specific lower threshold per column element, with missing values:
+>>> t = pd.Series([2, -4, np.nan, 6, 3])
+>>> t
+0 2.0
+1 -4.0
+2 NaN
+3 6.0
+4 3.0
+dtype: float64
+
>>> df.clip(t, axis=0)
+col_0 col_1
+0 9 2
+1 -3 -4
+2 0 6
+3 6 8
+4 5 3
+
asfreq
(
freq
, method=None
, how=None
, normalize=False
, fill_value=None
)
Convert time series to specified frequency.
Returns the original data conformed to a new index with the specified +frequency.
+If the index of this Series/DataFrame is a :class:~pandas.PeriodIndex
, the new index
+is the result of transforming the original index with
+:meth:PeriodIndex.asfreq <pandas.PeriodIndex.asfreq>
(so the original index
+will map one-to-one to the new index).
Otherwise, the new index will be equivalent to pd.date_range(start, end,
+freq=freq)
where start
and end
are, respectively, the first and
+last entries in the original index (see :func:pandas.date_range
). The
+values corresponding to any timesteps in the new index which were not present
+in the original index will be null (NaN
), unless a method for filling
+such unknowns is provided (see the method
parameter below).
The :meth:resample
method is more appropriate if an operation on each group of
+timesteps (such as an aggregate) is necessary to represent the data at the new
+frequency.
freq
+(DateOffset or str)
+— Frequency DateOffset or string.method
+({'backfill'/'bfill', 'pad'/'ffill'}, default None)
+— Method to use for filling holes in reindexed Series (note thisdoes not fill NaNs that already were present):how
+({'start', 'end'}, default end)
+— For PeriodIndex only (see PeriodIndex.asfreq).normalize
+(bool, default False)
+— Whether to reset output index to midnight.fill_value
+(scalar, optional)
+— Value to use for missing values, applied during upsampling (notethis does not fill NaNs that already were present).
+Series/DataFrame object reindexed to the specified frequency.
reindex : Conform DataFrame to new index with optional filling logic.
Notes
+To learn more about the frequency strings, please see this link
+<https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>
__.
Start by creating a series with 4 one minute timestamps.
>>> index = pd.date_range('1/1/2000', periods=4, freq='min')
+>>> series = pd.Series([0.0, None, 2.0, 3.0], index=index)
+>>> df = pd.DataFrame({'s': series})
+>>> df
+ s
+2000-01-01 00:00:00 0.0
+2000-01-01 00:01:00 NaN
+2000-01-01 00:02:00 2.0
+2000-01-01 00:03:00 3.0
+
Upsample the series into 30 second bins.
+>>> df.asfreq(freq='30s')
+ s
+2000-01-01 00:00:00 0.0
+2000-01-01 00:00:30 NaN
+2000-01-01 00:01:00 NaN
+2000-01-01 00:01:30 NaN
+2000-01-01 00:02:00 2.0
+2000-01-01 00:02:30 NaN
+2000-01-01 00:03:00 3.0
+
Upsample again, providing a fill value
.
>>> df.asfreq(freq='30s', fill_value=9.0)
+ s
+2000-01-01 00:00:00 0.0
+2000-01-01 00:00:30 9.0
+2000-01-01 00:01:00 NaN
+2000-01-01 00:01:30 9.0
+2000-01-01 00:02:00 2.0
+2000-01-01 00:02:30 9.0
+2000-01-01 00:03:00 3.0
+
Upsample again, providing a method
.
>>> df.asfreq(freq='30s', method='bfill')
+ s
+2000-01-01 00:00:00 0.0
+2000-01-01 00:00:30 NaN
+2000-01-01 00:01:00 NaN
+2000-01-01 00:01:30 2.0
+2000-01-01 00:02:00 2.0
+2000-01-01 00:02:30 3.0
+2000-01-01 00:03:00 3.0
+
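+As a rule of thumb (a sketch, not part of the original examples): asfreq only reindexes to
+the new frequency, while resample aggregates the values falling into each bin. Reusing the
+df defined above:
+>>> df.asfreq(freq='2min')      # keeps the rows at 00:00 and 00:02; no aggregation
+>>> df.resample('2min').mean()  # averages the values within each 2-minute bin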
at_time
(
time
, asof=False
, axis=None
)
Select values at particular time of day (e.g., 9:30AM).
time
+(datetime.time or str)
+— The values to select.axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— For Series
this parameter is unused and defaults to 0.TypeError
+
+— If the index is not a :class:DatetimeIndex
between_time : Select values between particular times of the day.
+first : Select initial periods of time series based on a date offset.
+last : Select final periods of time series based on a date offset.
+DatetimeIndex.indexer_at_time : Get just the index locations for
+    values at particular time of the day.
+>>> i = pd.date_range('2018-04-09', periods=4, freq='12h')
+>>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
+>>> ts
+ A
+2018-04-09 00:00:00 1
+2018-04-09 12:00:00 2
+2018-04-10 00:00:00 3
+2018-04-10 12:00:00 4
+
>>> ts.at_time('12:00')
+ A
+2018-04-09 12:00:00 2
+2018-04-10 12:00:00 4
+
between_time
(
start_time
, end_time
, inclusive='both'
, axis=None
)
Select values between particular times of the day (e.g., 9:00-9:30 AM).
By setting start_time
to be later than end_time
,
+you can get the times that are not between the two times.
start_time
+(datetime.time or str)
+— Initial time as a time filter limit.end_time
+(datetime.time or str)
+— End time as a time filter limit.inclusive
+({"both", "neither", "left", "right"}, default "both")
+— Include boundaries; whether to set each bound as closed or open.axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— Determine range time on index or columns value.For Series
this parameter is unused and defaults to 0.
+Data from the original object filtered to the specified dates range.
TypeError
+
+— If the index is not a :class:DatetimeIndex
at_time : Select values at a particular time of the day.
+first : Select initial periods of time series based on a date offset.
+last : Select final periods of time series based on a date offset.
+DatetimeIndex.indexer_between_time : Get just the index locations for
+    values between particular times of the day.
+>>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
+>>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
+>>> ts
+ A
+2018-04-09 00:00:00 1
+2018-04-10 00:20:00 2
+2018-04-11 00:40:00 3
+2018-04-12 01:00:00 4
+
>>> ts.between_time('0:15', '0:45')
+ A
+2018-04-10 00:20:00 2
+2018-04-11 00:40:00 3
+
You get the times that are not between two times by setting
+start_time
later than end_time
:
>>> ts.between_time('0:45', '0:15')
+ A
+2018-04-09 00:00:00 1
+2018-04-12 01:00:00 4
+
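+The inclusive argument controls whether the boundary times themselves are kept. A minimal
+sketch (not part of the original examples), reusing the ts defined above:
+>>> ts.between_time('0:20', '0:40', inclusive='left')     # keeps 00:20 but drops 00:40
+>>> ts.between_time('0:20', '0:40', inclusive='neither')  # drops both boundaries (empty here)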
resample
(
rule
, axis=<no_default>
, closed=None
, label=None
, convention=<no_default>
, kind=<no_default>
, on=None
, level=None
, origin='start_day'
, offset=None
, group_keys=False
)
Resample time-series data.
Convenience method for frequency conversion and resampling of time series.
+The object must have a datetime-like index (DatetimeIndex
, PeriodIndex
,
+or TimedeltaIndex
), or the caller must pass the label of a datetime-like
+series/index to the on
/level
keyword parameter.
rule
+(DateOffset, Timedelta or str)
+— The offset string or object representing target conversion.axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— Which axis to use for up- or down-sampling. For Series
this parameteris unused and defaults to 0. Must be
+DatetimeIndex
, TimedeltaIndex
or PeriodIndex
.closed
+({'right', 'left'}, default None)
+— Which side of bin interval is closed. The default is 'left'for all frequency offsets except for 'ME', 'YE', 'QE', 'BME',
+'BA', 'BQE', and 'W' which all have a default of 'right'.
+label
+({'right', 'left'}, default None)
+— Which bin edge label to label bucket with. The default is 'left'for all frequency offsets except for 'ME', 'YE', 'QE', 'BME',
+'BA', 'BQE', and 'W' which all have a default of 'right'.
+convention
+({'start', 'end', 's', 'e'}, default 'start')
+— For PeriodIndex
only, controls whether to use the start orend of rule
.kind
+({'timestamp', 'period'}, optional, default None)
+— Pass 'timestamp' to convert the resulting index to aDateTimeIndex
or 'period' to convert it to a PeriodIndex
.
+By default the input representation is retained.on
+(str, optional)
+— For a DataFrame, column to use instead of index for resampling.Column must be datetime-like.
+level
+(str or int, optional)
+— For a MultiIndex, level (name or number) to use forresampling. level
must be datetime-like.
+origin
+(Timestamp or str, default 'start_day')
+— The timestamp on which to adjust the grouping. The timezone of originmust match the timezone of the index.
+If string, must be one of the following:origin
is 1970-01-01origin
is the first value of the timeseriesorigin
is the first day at midnight of the timeseriesorigin
is the last value of the timeseriesorigin
is the ceiling midnight of the last dayOnly takes effect for Tick-frequencies (i.e. fixed frequencies like
+days, hours, and minutes, rather than months or quarters).
+
+offset
+(Timedelta or str, default is None)
+— An offset timedelta added to the origin.group_keys
+(bool, default False)
+— Whether to include the group keys in the result index when using.apply()
on the resampled object.Not specifying ``group_keys`` will retain values-dependent behavior
+from pandas 1.4 and earlier (see :ref:`pandas 1.5.0 Release notes
+<whatsnew_150.enhancements.resample_group_keys>` for examples).
+
+.. versionchanged:: 2.0.0``group_keys`` now defaults to ``False``.
+
+:class:~pandas.core.Resampler
object.
Series.resample : Resample a Series.
+DataFrame.resample : Resample a DataFrame.
+groupby : Group Series/DataFrame by mapping, function, label, or list of labels.
+asfreq : Reindex a Series/DataFrame with the given frequency without grouping.
+Notes
+See the user guide
+<https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>
__
+for more.
To learn more about the offset strings, please see this link
+<https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>
__.
Start by creating a series with 9 one minute timestamps.
>>> index = pd.date_range('1/1/2000', periods=9, freq='min')
+>>> series = pd.Series(range(9), index=index)
+>>> series
+2000-01-01 00:00:00 0
+2000-01-01 00:01:00 1
+2000-01-01 00:02:00 2
+2000-01-01 00:03:00 3
+2000-01-01 00:04:00 4
+2000-01-01 00:05:00 5
+2000-01-01 00:06:00 6
+2000-01-01 00:07:00 7
+2000-01-01 00:08:00 8
+Freq: min, dtype: int64
+
Downsample the series into 3 minute bins and sum the values +of the timestamps falling into a bin.
+>>> series.resample('3min').sum()
+2000-01-01 00:00:00 3
+2000-01-01 00:03:00 12
+2000-01-01 00:06:00 21
+Freq: 3min, dtype: int64
+
Downsample the series into 3 minute bins as above, but label each
+bin using the right edge instead of the left. Please note that the
+value in the bucket used as the label is not included in the bucket,
+which it labels. For example, in the original series the
+bucket 2000-01-01 00:03:00
contains the value 3, but the summed
+value in the resampled bucket with the label 2000-01-01 00:03:00
+does not include 3 (if it did, the summed value would be 6, not 3).
>>> series.resample('3min', label='right').sum()
+2000-01-01 00:03:00 3
+2000-01-01 00:06:00 12
+2000-01-01 00:09:00 21
+Freq: 3min, dtype: int64
+
To include this value, close the right side of the bin interval,
+as shown below.
+>>> series.resample('3min', label='right', closed='right').sum()
+2000-01-01 00:00:00 0
+2000-01-01 00:03:00 6
+2000-01-01 00:06:00 15
+2000-01-01 00:09:00 15
+Freq: 3min, dtype: int64
+
Upsample the series into 30 second bins.
+>>> series.resample('30s').asfreq()[0:5] # Select first 5 rows
+2000-01-01 00:00:00 0.0
+2000-01-01 00:00:30 NaN
+2000-01-01 00:01:00 1.0
+2000-01-01 00:01:30 NaN
+2000-01-01 00:02:00 2.0
+Freq: 30s, dtype: float64
+
Upsample the series into 30 second bins and fill the NaN
+values using the ffill
method.
>>> series.resample('30s').ffill()[0:5]
+2000-01-01 00:00:00 0
+2000-01-01 00:00:30 0
+2000-01-01 00:01:00 1
+2000-01-01 00:01:30 1
+2000-01-01 00:02:00 2
+Freq: 30s, dtype: int64
+
Upsample the series into 30 second bins and fill the
+NaN
values using the bfill
method.
>>> series.resample('30s').bfill()[0:5]
+2000-01-01 00:00:00 0
+2000-01-01 00:00:30 1
+2000-01-01 00:01:00 1
+2000-01-01 00:01:30 2
+2000-01-01 00:02:00 2
+Freq: 30s, dtype: int64
+
Pass a custom function via apply
>>> def custom_resampler(arraylike):
+... return np.sum(arraylike) + 5
+...
+>>> series.resample('3min').apply(custom_resampler)
+2000-01-01 00:00:00 8
+2000-01-01 00:03:00 17
+2000-01-01 00:06:00 26
+Freq: 3min, dtype: int64
+
For DataFrame objects, the keyword on
can be used to specify the
+column instead of the index for resampling.
>>> d = {'price': [10, 11, 9, 13, 14, 18, 17, 19],
+... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}
+>>> df = pd.DataFrame(d)
+>>> df['week_starting'] = pd.date_range('01/01/2018',
+... periods=8,
+... freq='W')
+>>> df
+ price volume week_starting
+0 10 50 2018-01-07
+1 11 60 2018-01-14
+2 9 40 2018-01-21
+3 13 100 2018-01-28
+4 14 50 2018-02-04
+5 18 100 2018-02-11
+6 17 40 2018-02-18
+7 19 50 2018-02-25
+>>> df.resample('ME', on='week_starting').mean()
+ price volume
+week_starting
+2018-01-31 10.75 62.5
+2018-02-28 17.00 60.0
+
For a DataFrame with MultiIndex, the keyword level
can be used to
+specify on which level the resampling needs to take place.
>>> days = pd.date_range('1/1/2000', periods=4, freq='D')
+>>> d2 = {'price': [10, 11, 9, 13, 14, 18, 17, 19],
+... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}
+>>> df2 = pd.DataFrame(
+... d2,
+... index=pd.MultiIndex.from_product(
+... [days, ['morning', 'afternoon']]
+... )
+... )
+>>> df2
+ price volume
+2000-01-01 morning 10 50
+ afternoon 11 60
+2000-01-02 morning 9 40
+ afternoon 13 100
+2000-01-03 morning 14 50
+ afternoon 18 100
+2000-01-04 morning 17 40
+ afternoon 19 50
+>>> df2.resample('D', level=0).sum()
+ price volume
+2000-01-01 21 110
+2000-01-02 22 140
+2000-01-03 32 150
+2000-01-04 36 90
+
If you want to adjust the start of the bins based on a fixed timestamp:
+>>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
+>>> rng = pd.date_range(start, end, freq='7min')
+>>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
+>>> ts
+2000-10-01 23:30:00 0
+2000-10-01 23:37:00 3
+2000-10-01 23:44:00 6
+2000-10-01 23:51:00 9
+2000-10-01 23:58:00 12
+2000-10-02 00:05:00 15
+2000-10-02 00:12:00 18
+2000-10-02 00:19:00 21
+2000-10-02 00:26:00 24
+Freq: 7min, dtype: int64
+
>>> ts.resample('17min').sum()
+2000-10-01 23:14:00 0
+2000-10-01 23:31:00 9
+2000-10-01 23:48:00 21
+2000-10-02 00:05:00 54
+2000-10-02 00:22:00 24
+Freq: 17min, dtype: int64
+
>>> ts.resample('17min', origin='epoch').sum()
+2000-10-01 23:18:00 0
+2000-10-01 23:35:00 18
+2000-10-01 23:52:00 27
+2000-10-02 00:09:00 39
+2000-10-02 00:26:00 24
+Freq: 17min, dtype: int64
+
>>> ts.resample('17min', origin='2000-01-01').sum()
+2000-10-01 23:24:00 3
+2000-10-01 23:41:00 15
+2000-10-01 23:58:00 45
+2000-10-02 00:15:00 45
+Freq: 17min, dtype: int64
+
If you want to adjust the start of the bins with an offset
Timedelta, the two
+following lines are equivalent:
>>> ts.resample('17min', origin='start').sum()
+2000-10-01 23:30:00 9
+2000-10-01 23:47:00 21
+2000-10-02 00:04:00 54
+2000-10-02 00:21:00 24
+Freq: 17min, dtype: int64
+
>>> ts.resample('17min', offset='23h30min').sum()
+2000-10-01 23:30:00 9
+2000-10-01 23:47:00 21
+2000-10-02 00:04:00 54
+2000-10-02 00:21:00 24
+Freq: 17min, dtype: int64
+
If you want to take the largest Timestamp as the end of the bins:
+>>> ts.resample('17min', origin='end').sum()
+2000-10-01 23:35:00 0
+2000-10-01 23:52:00 18
+2000-10-02 00:09:00 27
+2000-10-02 00:26:00 63
+Freq: 17min, dtype: int64
+
In contrast with the start_day
, you can use end_day
to take the ceiling
+midnight of the largest Timestamp as the end of the bins and drop the bins
+not containing data:
>>> ts.resample('17min', origin='end_day').sum()
+2000-10-01 23:38:00 3
+2000-10-01 23:55:00 15
+2000-10-02 00:12:00 45
+2000-10-02 00:29:00 45
+Freq: 17min, dtype: int64
+
first
(
offset
)
Select initial periods of time series data based on a date offset.
.. deprecated:: 2.1
+ :meth:.first
is deprecated and will be removed in a future version.
+ Please create a mask and filter using .loc
instead.
For a DataFrame with a sorted DatetimeIndex, this function can +select the first few rows based on a date offset.
+offset
+(str, DateOffset or dateutil.relativedelta)
+— The offset length of the data that will be selected. For instance,'1ME' will display all the rows having their index within the first month.
+A subset of the caller.
TypeError
+
+— If the index is not a :class:DatetimeIndex
last : Select final periods of time series based on a date offset.
+at_time : Select values at a particular time of the day.
+between_time : Select values between particular times of the day.
+>>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
+>>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
+>>> ts
+ A
+2018-04-09 1
+2018-04-11 2
+2018-04-13 3
+2018-04-15 4
+
Get the rows for the first 3 days:
+>>> ts.first('3D')
+ A
+2018-04-09 1
+2018-04-11 2
+
Notice that data for the first 3 calendar days were returned, not the first
+3 days observed in the dataset, and therefore data for 2018-04-13 was
+not returned.
+last
(
offset
)
Select final periods of time series data based on a date offset.
.. deprecated:: 2.1
+ :meth:.last
is deprecated and will be removed in a future version.
+ Please create a mask and filter using .loc
instead.
For a DataFrame with a sorted DatetimeIndex, this function +selects the last few rows based on a date offset.
+offset
+(str, DateOffset, dateutil.relativedelta)
+— The offset length of the data that will be selected. For instance,'3D' will display all the rows having their index within the last 3 days.
+A subset of the caller.
TypeError
+
+— If the index is not a :class:DatetimeIndex
first : Select initial periods of time series based on a date offset.
+at_time : Select values at a particular time of the day.
+between_time : Select values between particular times of the day.
+Notes
+.. deprecated:: 2.1.0
+ Please create a mask and filter using .loc
instead
>>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
+>>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
+>>> ts
+ A
+2018-04-09 1
+2018-04-11 2
+2018-04-13 3
+2018-04-15 4
+
Get the rows for the last 3 days:
+>>> ts.last('3D') # doctest: +SKIP
+ A
+2018-04-13 3
+2018-04-15 4
+
Notice that data for the last 3 calendar days were returned, not the last
+3 observed days in the dataset, and therefore data for 2018-04-11 was
+not returned.
+rank
(
axis=0
, method='average'
, numeric_only=False
, na_option='keep'
, ascending=True
, pct=False
)
Compute numerical data ranks (1 through n) along axis.
By default, equal values are assigned a rank that is the average of the +ranks of those values.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— Index to direct ranking.For Series
this parameter is unused and defaults to 0.
+method
+({'average', 'min', 'max', 'first', 'dense'}, default 'average')
+— How to rank the group of records that have the same value (i.e. ties):numeric_only
+(bool, default False)
+— For DataFrame objects, rank only numeric columns if set to True.numeric_only
is now False
.
+na_option
+({'keep', 'top', 'bottom'}, default 'keep')
+— How to rank NaN values:ascending
+(bool, default True)
+— Whether or not the elements should be ranked in ascending order.pct
+(bool, default False)
+— Whether or not to display the returned rankings in percentileform.
+Return a Series or DataFrame with data ranks as values.
core.groupby.DataFrameGroupBy.rank : Rank of values within each group.
+core.groupby.SeriesGroupBy.rank : Rank of values within each group.
+>>> df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog',
+...                                    'spider', 'snake'],
+... 'Number_legs': [4, 2, 4, 8, np.nan]})
+>>> df
+ Animal Number_legs
+0 cat 4.0
+1 penguin 2.0
+2 dog 4.0
+3 spider 8.0
+4 snake NaN
+
Ties are assigned the mean of the ranks (by default) for the group.
+>>> s = pd.Series(range(5), index=list("abcde"))
+>>> s["d"] = s["b"]
+>>> s.rank()
+a 1.0
+b 2.5
+c 4.0
+d 2.5
+e 5.0
+dtype: float64
+
The following example shows how the method behaves with the above +parameters:
+method = 'max'
the records that have the
+ same values are ranked using the highest rank (e.g.: since 'cat'
+ and 'dog' are both in the 2nd and 3rd position, rank 3 is assigned.)na_option = 'bottom'
, if there are records
+ with NaN values they are placed at the bottom of the ranking.pct = True
, the ranking is expressed as
+ percentile rank.
>>> df['default_rank'] = df['Number_legs'].rank()
+>>> df['max_rank'] = df['Number_legs'].rank(method='max')
+>>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom')
+>>> df['pct_rank'] = df['Number_legs'].rank(pct=True)
+>>> df
+ Animal Number_legs default_rank max_rank NA_bottom pct_rank
+0 cat 4.0 2.5 3.0 2.5 0.625
+1 penguin 2.0 1.0 1.0 1.0 0.250
+2 dog 4.0 2.5 3.0 2.5 0.625
+3 spider 8.0 4.0 4.0 4.0 1.000
+4 snake NaN NaN NaN 5.0 NaN
+
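+A short sketch of the remaining tie-breaking options (not part of the original examples),
+reusing the df defined above:
+>>> df['Number_legs'].rank(method='dense')   # ties share a rank and the next rank is not skipped
+>>> df['Number_legs'].rank(method='first')   # ties are broken by order of appearance
+>>> df['Number_legs'].rank(ascending=False)  # the largest value gets rank 1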
align
(
other
, join='outer'
, axis=None
, level=None
, copy=None
, fill_value=None
, method=<no_default>
, limit=<no_default>
, fill_axis=<no_default>
, broadcast_axis=<no_default>
)
Align two objects on their axes with the specified join method.
Join method is specified for each axis Index.
+join
+({'outer', 'inner', 'left', 'right'}, default 'outer')
+— Type of alignment to be performed.axis
+(allowed axis of the other object, default None)
+— Align on index (0), columns (1), or both (None).level
+(int or level name, default None)
+— Broadcast across a level, matching Index values on thepassed MultiIndex level.
+copy
+(bool, default True)
+— Always returns new objects. If copy=False and no reindexing isrequired then original objects are returned.copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas.You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+fill_value
+(scalar, default np.nan)
+— Value to use for missing values. Defaults to NaN, but can be any"compatible" value.
+method
+({'backfill', 'bfill', 'pad', 'ffill', None}, default None)
+— Method to use for filling holes in reindexed Series:limit
+(int, default None)
+— If method is specified, this is the maximum number of consecutiveNaN values to forward/backward fill. In other words, if there is
+a gap with more than this number of consecutive NaNs, it will only
+be partially filled. If method is not specified, this is the
+maximum number of entries along the entire axis where NaNs will be
+filled. Must be greater than 0 if not None.fill_axis
+({0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame, default 0)
+— Filling axis, method and limit.broadcast_axis
+({0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame, default None)
+— Broadcast values along this axis, if aligning two objects ofdifferent dimensions.Aligned objects.
>>> df = pd.DataFrame(
+...     [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2]
+... )
+>>> other = pd.DataFrame(
+... [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]],
+... columns=["A", "B", "C", "D"],
+... index=[2, 3, 4],
+... )
+>>> df
+ D B E A
+1 1 2 3 4
+2 6 7 8 9
+>>> other
+ A B C D
+2 10 20 30 40
+3 60 70 80 90
+4 600 700 800 900
+
Align on columns:
+>>> left, right = df.align(other, join="outer", axis=1)
+>>> left
+ A B C D E
+1 4 2 NaN 1 3
+2 9 7 NaN 6 8
+>>> right
+ A B C D E
+2 10 20 30 40 NaN
+3 60 70 80 90 NaN
+4 600 700 800 900 NaN
+
We can also align on the index:
+>>> left, right = df.align(other, join="outer", axis=0)
+>>> left
+ D B E A
+1 1.0 2.0 3.0 4.0
+2 6.0 7.0 8.0 9.0
+3 NaN NaN NaN NaN
+4 NaN NaN NaN NaN
+>>> right
+ A B C D
+1 NaN NaN NaN NaN
+2 10.0 20.0 30.0 40.0
+3 60.0 70.0 80.0 90.0
+4 600.0 700.0 800.0 900.0
+
Finally, the default axis=None
will align on both index and columns:
>>> left, right = df.align(other, join="outer", axis=None)
+>>> left
+ A B C D E
+1 4.0 2.0 NaN 1.0 3.0
+2 9.0 7.0 NaN 6.0 8.0
+3 NaN NaN NaN NaN NaN
+4 NaN NaN NaN NaN NaN
+>>> right
+ A B C D E
+1 NaN NaN NaN NaN NaN
+2 10.0 20.0 30.0 40.0 NaN
+3 60.0 70.0 80.0 90.0 NaN
+4 600.0 700.0 800.0 900.0 NaN
+
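+fill_value replaces the NaNs that the alignment itself introduces. A minimal sketch (not
+part of the original examples), reusing df and other from above:
+>>> left, right = df.align(other, join="outer", axis=1, fill_value=0)
+>>> left   # columns missing from each frame are filled with 0 instead of NaN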
where
(
cond
, other=nan
, inplace=False
, axis=None
, level=None
)
Replace values where the condition is False.
cond
+(bool Series/DataFrame, array-like, or callable)
+— Where cond
is True, keep the original value. WhereFalse, replace with corresponding value from other
.
+If cond
is callable, it is computed on the Series/DataFrame and
+should return boolean Series/DataFrame or array. The callable must
+not change input Series/DataFrame (though pandas doesn't check it).
+other
+(scalar, Series/DataFrame, or callable)
+— Entries where cond
is False are replaced withcorresponding value from other
.
+If other is callable, it is computed on the Series/DataFrame and
+should return scalar or Series/DataFrame. The callable must not
+change input Series/DataFrame (though pandas doesn't check it).
+If not specified, entries will be filled with the corresponding
+NULL value (np.nan
for numpy dtypes, pd.NA
for extension
+dtypes).
+inplace
+(bool, default False)
+— Whether to perform the operation in place on the data.axis
+(int, default None)
+— Alignment axis if needed. For Series
this parameter isunused and defaults to 0.
+level
+(int, default None)
+— Alignment level if needed.:func:DataFrame.mask
: Return an object of same shape as self.
Notes
+The where method is an application of the if-then idiom. For each
+element in the calling DataFrame, if cond
is True
the
+element is used; otherwise the corresponding element from the DataFrame
+other
is used. If the axis of other
does not align with axis of
+cond
Series/DataFrame, the misaligned index positions will be filled with
+False.
The signature for :func:DataFrame.where
differs from
+:func:numpy.where
. Roughly df1.where(m, df2)
is equivalent to
+np.where(m, df1, df2)
.
For further details and examples see the where
documentation in
+:ref:indexing <indexing.where_mask>
.
The dtype of the object takes precedence. The fill value is cast to
+the object's dtype, if this can be done losslessly.
+>>> s = pd.Series(range(5))
+>>> s.where(s > 0)
+0 NaN
+1 1.0
+2 2.0
+3 3.0
+4 4.0
+dtype: float64
+>>> s.mask(s > 0)
+0 0.0
+1 NaN
+2 NaN
+3 NaN
+4 NaN
+dtype: float64
+
>>> s = pd.Series(range(5))
+>>> t = pd.Series([True, False])
+>>> s.where(t, 99)
+0 0
+1 99
+2 99
+3 99
+4 99
+dtype: int64
+>>> s.mask(t, 99)
+0 99
+1 1
+2 99
+3 99
+4 99
+dtype: int64
+
>>> s.where(s > 1, 10)
+0 10
+1 10
+2 2
+3 3
+4 4
+dtype: int64
+>>> s.mask(s > 1, 10)
+0 0
+1 1
+2 10
+3 10
+4 10
+dtype: int64
+
>>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
+>>> df
+ A B
+0 0 1
+1 2 3
+2 4 5
+3 6 7
+4 8 9
+>>> m = df % 3 == 0
+>>> df.where(m, -df)
+ A B
+0 0 -1
+1 -2 3
+2 -4 -5
+3 6 -7
+4 -8 9
+>>> df.where(m, -df) == np.where(m, df, -df)
+ A B
+0 True True
+1 True True
+2 True True
+3 True True
+4 True True
+>>> df.where(m, -df) == df.mask(~m, -df)
+ A B
+0 True True
+1 True True
+2 True True
+3 True True
+4 True True
+
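+Both cond and other may be callables, evaluated on the calling object. A brief sketch (not
+part of the original examples), reusing df and m from above:
+>>> df.where(lambda x: x % 3 == 0, other=lambda x: -x)  # same result as df.where(m, -df) above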
mask
(
cond
, other=<no_default>
, inplace=False
, axis=None
, level=None
)
Replace values where the condition is True.
cond
+(bool Series/DataFrame, array-like, or callable)
+— Where cond
is False, keep the original value. WhereTrue, replace with corresponding value from other
.
+If cond
is callable, it is computed on the Series/DataFrame and
+should return boolean Series/DataFrame or array. The callable must
+not change input Series/DataFrame (though pandas doesn't check it).
+other
+(scalar, Series/DataFrame, or callable)
+— Entries where cond
is True are replaced withcorresponding value from other
.
+If other is callable, it is computed on the Series/DataFrame and
+should return scalar or Series/DataFrame. The callable must not
+change input Series/DataFrame (though pandas doesn't check it).
+If not specified, entries will be filled with the corresponding
+NULL value (np.nan
for numpy dtypes, pd.NA
for extension
+dtypes).
+inplace
+(bool, default False)
+— Whether to perform the operation in place on the data.axis
+(int, default None)
+— Alignment axis if needed. For Series
this parameter isunused and defaults to 0.
+level
+(int, default None)
+— Alignment level if needed.:func:DataFrame.where
: Return an object of same shape as self.
Notes
+The mask method is an application of the if-then idiom. For each
+element in the calling DataFrame, if cond
is False
the
+element is used; otherwise the corresponding element from the DataFrame
+other
is used. If the axis of other
does not align with axis of
+cond
Series/DataFrame, the misaligned index positions will be filled with
+True.
The signature for :func:DataFrame.where
differs from
+:func:numpy.where
. Roughly df1.where(m, df2)
is equivalent to
+np.where(m, df1, df2)
.
For further details and examples see the mask
documentation in
+:ref:indexing <indexing.where_mask>
.
The dtype of the object takes precedence. The fill value is cast to
+the object's dtype, if this can be done losslessly.
+>>> s = pd.Series(range(5))
+>>> s.where(s > 0)
+0 NaN
+1 1.0
+2 2.0
+3 3.0
+4 4.0
+dtype: float64
+>>> s.mask(s > 0)
+0 0.0
+1 NaN
+2 NaN
+3 NaN
+4 NaN
+dtype: float64
+
>>> s = pd.Series(range(5))
+>>> t = pd.Series([True, False])
+>>> s.where(t, 99)
+0 0
+1 99
+2 99
+3 99
+4 99
+dtype: int64
+>>> s.mask(t, 99)
+0 99
+1 1
+2 99
+3 99
+4 99
+dtype: int64
+
>>> s.where(s > 1, 10)
+0 10
+1 10
+2 2
+3 3
+4 4
+dtype: int64
+>>> s.mask(s > 1, 10)
+0 0
+1 1
+2 10
+3 10
+4 10
+dtype: int64
+
>>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
+>>> df
+ A B
+0 0 1
+1 2 3
+2 4 5
+3 6 7
+4 8 9
+>>> m = df % 3 == 0
+>>> df.where(m, -df)
+ A B
+0 0 -1
+1 -2 3
+2 -4 -5
+3 6 -7
+4 -8 9
+>>> df.where(m, -df) == np.where(m, df, -df)
+ A B
+0 True True
+1 True True
+2 True True
+3 True True
+4 True True
+>>> df.where(m, -df) == df.mask(~m, -df)
+ A B
+0 True True
+1 True True
+2 True True
+3 True True
+4 True True
+
truncate
(
before=None
, after=None
, axis=None
, copy=None
)
Truncate a Series or DataFrame before and after some index value.
This is a useful shorthand for boolean indexing based on index +values above or below certain thresholds.
+before
+(date, str, int)
+— Truncate all rows before this index value.after
+(date, str, int)
+— Truncate all rows after this index value.axis
+({0 or 'index', 1 or 'columns'}, optional)
+— Axis to truncate. Truncates the index (rows) by default.For Series
this parameter is unused and defaults to 0.
+copy
+(bool, default is True,)
+— Return a copy of the truncated section.copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas.You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+The truncated Series or DataFrame.
DataFrame.loc : Select a subset of a DataFrame by label.
+DataFrame.iloc : Select a subset of a DataFrame by position.
+Notes
+If the index being truncated contains only datetime values,
+before
and after
may be specified as strings instead of
+Timestamps.
>>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
+...                    'B': ['f', 'g', 'h', 'i', 'j'],
+... 'C': ['k', 'l', 'm', 'n', 'o']},
+... index=[1, 2, 3, 4, 5])
+>>> df
+ A B C
+1 a f k
+2 b g l
+3 c h m
+4 d i n
+5 e j o
+
>>> df.truncate(before=2, after=4)
+ A B C
+2 b g l
+3 c h m
+4 d i n
+
The columns of a DataFrame can be truncated.
+>>> df.truncate(before="A", after="B", axis="columns")
+ A B
+1 a f
+2 b g
+3 c h
+4 d i
+5 e j
+
For Series, only rows can be truncated.
+>>> df['A'].truncate(before=2, after=4)
+2 b
+3 c
+4 d
+Name: A, dtype: object
+
The index values in truncate
can be datetimes or string
+dates.
>>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s')
+>>> df = pd.DataFrame(index=dates, data={'A': 1})
+>>> df.tail()
+ A
+2016-01-31 23:59:56 1
+2016-01-31 23:59:57 1
+2016-01-31 23:59:58 1
+2016-01-31 23:59:59 1
+2016-02-01 00:00:00 1
+
>>> df.truncate(before=pd.Timestamp('2016-01-05'),
+... after=pd.Timestamp('2016-01-10')).tail()
+ A
+2016-01-09 23:59:56 1
+2016-01-09 23:59:57 1
+2016-01-09 23:59:58 1
+2016-01-09 23:59:59 1
+2016-01-10 00:00:00 1
+
Because the index is a DatetimeIndex containing only dates, we can
+specify before
and after
as strings. They will be coerced to
+Timestamps before truncation.
>>> df.truncate('2016-01-05', '2016-01-10').tail()
+ A
+2016-01-09 23:59:56 1
+2016-01-09 23:59:57 1
+2016-01-09 23:59:58 1
+2016-01-09 23:59:59 1
+2016-01-10 00:00:00 1
+
Note that truncate
assumes a 0 value for any unspecified time
+component (midnight). This differs from partial string slicing, which
+returns any partially matching dates.
>>> df.loc['2016-01-05':'2016-01-10', :].tail()
+ A
+2016-01-10 23:59:55 1
+2016-01-10 23:59:56 1
+2016-01-10 23:59:57 1
+2016-01-10 23:59:58 1
+2016-01-10 23:59:59 1
+
tz_convert
(
tz
, axis=0
, level=None
, copy=None
)
Convert tz-aware axis to target time zone.
tz
+(str or tzinfo object or None)
+— Target time zone. Passing None
will convert toUTC and remove the timezone information.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis to convertlevel
+(int, str, default None)
+— If axis is a MultiIndex, convert a specific level. Otherwisemust be None.
+copy
+(bool, default True)
+— Also make a copy of the underlying data.copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas.You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+Object with time zone converted axis.
TypeError
+
+— If the axis is tz-naive.Change to another time zone:
>>> s = pd.Series(
+... [1],
+... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']),
+... )
+>>> s.tz_convert('Asia/Shanghai')
+2018-09-15 07:30:00+08:00 1
+dtype: int64
+
Pass None to convert to UTC and get a tz-naive index:
+>>> s = pd.Series([1],
+... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
+>>> s.tz_convert(None)
+2018-09-14 23:30:00 1
+dtype: int64
+
tz_localize
(
tz
, axis=0
, level=None
, copy=None
, ambiguous='raise'
, nonexistent='raise'
)
Localize tz-naive index of a Series or DataFrame to target time zone.
This operation localizes the Index. To localize the values in a
+timezone-naive Series, use :meth:Series.dt.tz_localize
.
tz
+(str or tzinfo or None)
+— Time zone to localize. Passing None
will remove the time zone information and preserve local time.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis to localize.
+level
+(int, str, default None)
+— If axis is a MultiIndex, localize a specific level. Otherwise must be None.
+copy
+(bool, default True)
+— Also make a copy of the underlying data. The copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas. You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+ambiguous
+('infer', bool-ndarray, 'NaT', default 'raise')
+— When clocks moved backward due to DST, ambiguous times may arise. For example, in Central European Time (UTC+01), when going from
+03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
+00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
+ambiguous
parameter dictates how ambiguous times should be
+handled.
+nonexistent
+(str, default 'raise')
+— A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. Valid values include 'raise' (default), 'shift_forward', 'shift_backward', 'NaT', or a timedelta object (see the examples below).
Same type as the input.
TypeError
+
+— If the TimeSeries is tz-aware and tz is not None.
Localize local times:
>>> s = pd.Series(
+... [1],
+... index=pd.DatetimeIndex(['2018-09-15 01:30:00']),
+... )
+>>> s.tz_localize('CET')
+2018-09-15 01:30:00+02:00 1
+dtype: int64
+
Pass None to convert to tz-naive index and preserve local time:
+>>> s = pd.Series([1],
+... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
+>>> s.tz_localize(None)
+2018-09-15 01:30:00 1
+dtype: int64
+
Be careful with DST changes. When there is sequential data, pandas +can infer the DST time:
+>>> s = pd.Series(range(7),
+... index=pd.DatetimeIndex(['2018-10-28 01:30:00',
+... '2018-10-28 02:00:00',
+... '2018-10-28 02:30:00',
+... '2018-10-28 02:00:00',
+... '2018-10-28 02:30:00',
+... '2018-10-28 03:00:00',
+... '2018-10-28 03:30:00']))
+>>> s.tz_localize('CET', ambiguous='infer')
+2018-10-28 01:30:00+02:00 0
+2018-10-28 02:00:00+02:00 1
+2018-10-28 02:30:00+02:00 2
+2018-10-28 02:00:00+01:00 3
+2018-10-28 02:30:00+01:00 4
+2018-10-28 03:00:00+01:00 5
+2018-10-28 03:30:00+01:00 6
+dtype: int64
+
In some cases, inferring the DST is impossible. In such cases, you can +pass an ndarray to the ambiguous parameter to set the DST explicitly
+>>> s = pd.Series(range(3),
+... index=pd.DatetimeIndex(['2018-10-28 01:20:00',
+... '2018-10-28 02:36:00',
+... '2018-10-28 03:46:00']))
+>>> s.tz_localize('CET', ambiguous=np.array([True, True, False]))
+2018-10-28 01:20:00+02:00 0
+2018-10-28 02:36:00+02:00 1
+2018-10-28 03:46:00+01:00 2
+dtype: int64
+
If the DST transition causes nonexistent times, you can shift these
+dates forward or backward with a timedelta object or 'shift_forward'
+or 'shift_backward'
.
>>> s = pd.Series(range(2),
+... index=pd.DatetimeIndex(['2015-03-29 02:30:00',
+... '2015-03-29 03:30:00']))
+>>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
+2015-03-29 03:00:00+02:00 0
+2015-03-29 03:30:00+02:00 1
+dtype: int64
+>>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward')
+2015-03-29 01:59:59.999999999+01:00 0
+2015-03-29 03:30:00+02:00 1
+dtype: int64
+>>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h'))
+2015-03-29 03:30:00+02:00 0
+2015-03-29 03:30:00+02:00 1
+dtype: int64
+
describe
(
percentiles=None
, include=None
, exclude=None
)
Generate descriptive statistics.
Descriptive statistics include those that summarize the central
+tendency, dispersion and shape of a
+dataset's distribution, excluding NaN
values.
Analyzes both numeric and object series, as well
+as DataFrame
column sets of mixed data types. The output
+will vary depending on what is provided. Refer to the notes
+below for more detail.
percentiles
+(list-like of numbers, optional)
+— The percentiles to include in the output. All should fall between 0 and 1. The default is
+[.25, .5, .75]
, which returns the 25th, 50th, and
+75th percentiles.
+include
+('all', list-like of dtypes or None (default), optional)
+— A white list of data types to include in the result. Ignoredfor Series
. Here are the options:numpy.number
. To limit it instead to object columns submit
+ the numpy.object
data type. Strings
+ can also be used in the style of
+ select_dtypes
(e.g. df.describe(include=['O'])
). To
+ select pandas categorical columns, use 'category'
exclude
+(list-like of dtypes or None (default), optional,)
+— A black list of data types to omit from the result. Ignoredfor Series
. Here are the options:numpy.number
. To exclude object columns submit the data
+ type numpy.object
. Strings can also be used in the style of
+ select_dtypes
(e.g. df.describe(exclude=['O'])
). To
+ exclude pandas categorical columns, use 'category'
Summary statistics of the Series or Dataframe provided.
DataFrame.count: Count number of non-NA/null observations.
DataFrame.max: Maximum of the values in the object.
DataFrame.min: Minimum of the values in the object.
DataFrame.mean: Mean of the values.
DataFrame.std: Standard deviation of the observations.
DataFrame.select_dtypes: Subset of a DataFrame including/excluding columns based on their dtype.
+Notes
+For numeric data, the result's index will include count
,
+mean
, std
, min
, max
as well as lower, 50
and
+upper percentiles. By default the lower percentile is 25
and the
+upper percentile is 75
. The 50
percentile is the
+same as the median.
For object data (e.g. strings or timestamps), the result's index
+will include count
, unique
, top
, and freq
. The top
+is the most common value. The freq
is the most common value's
+frequency. Timestamps also include the first
and last
items.
If multiple object values have the highest count, then the
+count
and top
results will be arbitrarily chosen from
+among those with the highest count.
For mixed data types provided via a DataFrame
, the default is to
+return only an analysis of numeric columns. If the dataframe consists
+only of object and categorical data without any numeric columns, the
+default is to return an analysis of both the object and categorical
+columns. If include='all'
is provided as an option, the result
+will include a union of attributes of each type.
The include
and exclude
parameters can be used to limit
+which columns in a DataFrame
are analyzed for the output.
+The parameters are ignored when analyzing a Series
.
Describing a numeric Series
.
>>> s = pd.Series([1, 2, 3])
+>>> s.describe()
+count 3.0
+mean 2.0
+std 1.0
+min 1.0
+25% 1.5
+50% 2.0
+75% 2.5
+max 3.0
+dtype: float64
+
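A custom list of percentiles can also be requested. The following is a small illustrative sketch (the median is always included in the output):
+>>> pd.Series([1, 2, 3, 4]).describe(percentiles=[0.1, 0.9])
+count    4.000000
+mean     2.500000
+std      1.290994
+min      1.000000
+10%      1.300000
+50%      2.500000
+90%      3.700000
+max      4.000000
+dtype: float64
+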
Describing a categorical Series
.
>>> s = pd.Series(['a', 'a', 'b', 'c'])
+>>> s.describe()
+count 4
+unique 3
+top a
+freq 2
+dtype: object
+
Describing a timestamp Series
.
>>> s = pd.Series([
+... np.datetime64("2000-01-01"),
+... np.datetime64("2010-01-01"),
+... np.datetime64("2010-01-01")
+... ])
+>>> s.describe()
+count 3
+mean 2006-09-01 08:00:00
+min 2000-01-01 00:00:00
+25% 2004-12-31 12:00:00
+50% 2010-01-01 00:00:00
+75% 2010-01-01 00:00:00
+max 2010-01-01 00:00:00
+dtype: object
+
Describing a DataFrame
. By default only numeric fields
+are returned.
>>> df = pd.DataFrame({'categorical': pd.Categorical(['d', 'e', 'f']),
+... 'numeric': [1, 2, 3],
+... 'object': ['a', 'b', 'c']
+... })
+>>> df.describe()
+ numeric
+count 3.0
+mean 2.0
+std 1.0
+min 1.0
+25% 1.5
+50% 2.0
+75% 2.5
+max 3.0
+
Describing all columns of a DataFrame
regardless of data type.
>>> df.describe(include='all') # doctest: +SKIP
+ categorical numeric object
+count 3 3.0 3
+unique 3 NaN 3
+top f NaN a
+freq 1 NaN 1
+mean NaN 2.0 NaN
+std NaN 1.0 NaN
+min NaN 1.0 NaN
+25% NaN 1.5 NaN
+50% NaN 2.0 NaN
+75% NaN 2.5 NaN
+max NaN 3.0 NaN
+
Describing a column from a DataFrame
by accessing it as
+an attribute.
>>> df.numeric.describe()
+count 3.0
+mean 2.0
+std 1.0
+min 1.0
+25% 1.5
+50% 2.0
+75% 2.5
+max 3.0
+Name: numeric, dtype: float64
+
Including only numeric columns in a DataFrame
description.
>>> df.describe(include=[np.number])
+ numeric
+count 3.0
+mean 2.0
+std 1.0
+min 1.0
+25% 1.5
+50% 2.0
+75% 2.5
+max 3.0
+
Including only string columns in a DataFrame
description.
>>> df.describe(include=[object]) # doctest: +SKIP
+ object
+count 3
+unique 3
+top a
+freq 1
+
Including only categorical columns from a DataFrame
description.
>>> df.describe(include=['category'])
+ categorical
+count 3
+unique 3
+top d
+freq 1
+
Excluding numeric columns from a DataFrame
description.
>>> df.describe(exclude=[np.number]) # doctest: +SKIP
+ categorical object
+count 3 3
+unique 3 3
+top f a
+freq 1 1
+
Excluding object columns from a DataFrame
description.
>>> df.describe(exclude=[object]) # doctest: +SKIP
+ categorical numeric
+count 3 3.0
+unique 3 NaN
+top f NaN
+freq 1 NaN
+mean NaN 2.0
+std NaN 1.0
+min NaN 1.0
+25% NaN 1.5
+50% NaN 2.0
+75% NaN 2.5
+max NaN 3.0
+
pct_change
(
periods=1
, fill_method=<no_default>
, limit=<no_default>
, freq=None
, **kwargs
)
Fractional change between the current and a prior element.
Computes the fractional change from the immediately previous row by +default. This is useful in comparing the fraction of change in a time +series of elements.
+.. note::
+Despite the name of this method, it calculates fractional change
+(also known as per unit change or relative change) and not
+percentage change. If you need the percentage change, multiply
+these values by 100.
+
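For instance, a small sketch: multiplying the result by 100 gives the conventional percentage change.
+>>> s = pd.Series([100, 110])
+>>> s.pct_change() * 100
+0     NaN
+1    10.0
+dtype: float64
+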
+periods
+(int, default 1)
+— Periods to shift for forming percent change.fill_method
+({'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad')
+— How to handle NAs before computing percent changes. All options of fill_method are deprecated except fill_method=None.
+limit
+(int, default None)
+— The number of consecutive NAs to fill before stopping.
+freq
+(DateOffset, timedelta, or str, optional)
+— Increment to use from time series API (e.g. 'ME' or BDay()).
+**kwargs
+
+— Additional keyword arguments are passed into DataFrame.shift
or Series.shift
.
+The same type as the calling object.
Series.diff : Compute the difference of two elements in a Series.
DataFrame.diff : Compute the difference of two elements in a DataFrame.
Series.shift : Shift the index by some number of periods.
DataFrame.shift : Shift the index by some number of periods.
+Series
>>> s = pd.Series([90, 91, 85])
+>>> s
+0 90
+1 91
+2 85
+dtype: int64
+
>>> s.pct_change()
+0 NaN
+1 0.011111
+2 -0.065934
+dtype: float64
+
>>> s.pct_change(periods=2)
+0 NaN
+1 NaN
+2 -0.055556
+dtype: float64
+
See the percentage change in a Series where filling NAs with last +valid observation forward to next valid.
+>>> s = pd.Series([90, 91, None, 85])
+>>> s
+0 90.0
+1 91.0
+2 NaN
+3 85.0
+dtype: float64
+
>>> s.ffill().pct_change()
+0 NaN
+1 0.011111
+2 0.000000
+3 -0.065934
+dtype: float64
+
DataFrame
+Percentage change in French franc, Deutsche Mark, and Italian lira from +1980-01-01 to 1980-03-01.
+>>> df = pd.DataFrame({
+... 'FR': [4.0405, 4.0963, 4.3149],
+... 'GR': [1.7246, 1.7482, 1.8519],
+... 'IT': [804.74, 810.01, 860.13]},
+... index=['1980-01-01', '1980-02-01', '1980-03-01'])
+>>> df
+ FR GR IT
+1980-01-01 4.0405 1.7246 804.74
+1980-02-01 4.0963 1.7482 810.01
+1980-03-01 4.3149 1.8519 860.13
+
>>> df.pct_change()
+ FR GR IT
+1980-01-01 NaN NaN NaN
+1980-02-01 0.013810 0.013684 0.006549
+1980-03-01 0.053365 0.059318 0.061876
+
Percentage of change in GOOG and APPL stock volume. Shows computing +the percentage change between columns.
+>>> df = pd.DataFrame({
+... '2016': [1769950, 30586265],
+... '2015': [1500923, 40912316],
+... '2014': [1371819, 41403351]},
+... index=['GOOG', 'APPL'])
+>>> df
+ 2016 2015 2014
+GOOG 1769950 1500923 1371819
+APPL 30586265 40912316 41403351
+
>>> df.pct_change(axis='columns', periods=-1)
+ 2016 2015 2014
+GOOG 0.179241 0.094112 NaN
+APPL -0.252395 -0.011860 NaN
+
rolling
(
window
, min_periods=None
, center=False
, win_type=None
, on=None
, axis=<no_default>
, closed=None
, step=None
, method='single'
)
Provide rolling window calculations.
window
+(int, timedelta, str, offset, or BaseIndexer subclass)
+— Size of the moving window. If an integer, the fixed number of observations used for each window. If a timedelta, str, or offset, the time period of each window (for valid offset aliases see this link
+<https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>
__). If a BaseIndexer subclass, the window boundaries are computed from its get_window_bounds
method. Additional rolling
+keyword arguments, namely min_periods
, center
, closed
and
+step
will be passed to get_window_bounds
.
+min_periods
+(int, default None)
+— Minimum number of observations in window required to have a value; otherwise, the result is np.nan. For a window specified by an offset, min_periods defaults to 1. For a window specified by an integer, min_periods defaults to the size of the window.
+center
+(bool, default False)
+— If False, set the window labels as the right edge of the window index; if True, set the window labels as the center of the window index.
+win_type
+(str, default None)
+— If None
, all points are evenly weighted. If a string, it must be a valid scipy.signal window function
+<https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>
__.
+on
+(str, optional)
+— For a DataFrame, a column label or Index level on which to calculate the rolling window, rather than the DataFrame's index.
+axis
+(int or str, default 0)
+— If 0
or 'index'
, roll across the rows.1
or 'columns'
, roll across the columns.Series
this parameter is unused and defaults to 0.The axis keyword is deprecated. For ``axis=1``,
+transpose the DataFrame first instead.
+
+closed
+(str, default None)
+— If 'right'
, the first point in the window is excluded from calculations.'left'
, the last point in the window is excluded from calculations.'both'
, no points in the window are excluded from calculations. If 'neither'
, the first and last points in the window are excluded
+from calculations.None
('right'
).
+step
+(int, default None)
+— Evaluate the window at every step result, equivalent to slicing as [::step]. window must be an integer. Using a step argument other than None or 1 will produce a result with a different shape than the input.
+method
+(str {'single', 'table'}, default 'single')
+— Execute the rolling operation per single column or row ('single') or over the entire object ('table'). This argument is only implemented when specifying engine='numba' in the method call.
An instance of Window is returned if win_type
is passed. Otherwise, an instance of Rolling is returned.
expanding : Provides expanding transformations.
ewm : Provides exponential weighted functions.
+Notes
+See :ref:Windowing Operations <window.generic>
for further usage details
+and examples.
>>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
+>>> df
+ B
+0 0.0
+1 1.0
+2 2.0
+3 NaN
+4 4.0
+
window
+Rolling sum with a window length of 2 observations.
+>>> df.rolling(2).sum()
+ B
+0 NaN
+1 1.0
+2 3.0
+3 NaN
+4 NaN
+
Rolling sum with a window span of 2 seconds.
+>>> df_time = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]},
+... index=[pd.Timestamp('20130101 09:00:00'),
+... pd.Timestamp('20130101 09:00:02'),
+... pd.Timestamp('20130101 09:00:03'),
+... pd.Timestamp('20130101 09:00:05'),
+... pd.Timestamp('20130101 09:00:06')])
+
>>> df_time
+ B
+2013-01-01 09:00:00 0.0
+2013-01-01 09:00:02 1.0
+2013-01-01 09:00:03 2.0
+2013-01-01 09:00:05 NaN
+2013-01-01 09:00:06 4.0
+
>>> df_time.rolling('2s').sum()
+ B
+2013-01-01 09:00:00 0.0
+2013-01-01 09:00:02 1.0
+2013-01-01 09:00:03 3.0
+2013-01-01 09:00:05 NaN
+2013-01-01 09:00:06 4.0
+
Rolling sum with forward looking windows with 2 observations.
+>>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2)
+>>> df.rolling(window=indexer, min_periods=1).sum()
+ B
+0 1.0
+1 3.0
+2 2.0
+3 4.0
+4 4.0
+
min_periods
+Rolling sum with a window length of 2 observations, but only needs a minimum of 1 +observation to calculate a value.
+>>> df.rolling(2, min_periods=1).sum()
+ B
+0 0.0
+1 1.0
+2 3.0
+3 2.0
+4 4.0
+
center
+Rolling sum with the result assigned to the center of the window index.
+>>> df.rolling(3, min_periods=1, center=True).sum()
+ B
+0 1.0
+1 3.0
+2 3.0
+3 6.0
+4 4.0
+
>>> df.rolling(3, min_periods=1, center=False).sum()
+ B
+0 0.0
+1 1.0
+2 3.0
+3 3.0
+4 6.0
+
step
+Rolling sum with a window length of 2 observations, minimum of 1 observation to +calculate a value, and a step of 2.
+>>> df.rolling(2, min_periods=1, step=2).sum()
+ B
+0 0.0
+2 3.0
+4 4.0
+
win_type
+Rolling sum with a window length of 2, using the Scipy 'gaussian'
+window type. std
is required in the aggregation function.
>>> df.rolling(2, win_type='gaussian').sum(std=3)
+ B
+0 NaN
+1 0.986207
+2 2.958621
+3 NaN
+4 NaN
+
on
+Rolling sum with a window length of 2 days.
+>>> df = pd.DataFrame({
+... 'A': [pd.to_datetime('2020-01-01'),
+... pd.to_datetime('2020-01-01'),
+... pd.to_datetime('2020-01-02'),],
+... 'B': [1, 2, 3], },
+... index=pd.date_range('2020', periods=3))
+
>>> df
+ A B
+2020-01-01 2020-01-01 1
+2020-01-02 2020-01-01 2
+2020-01-03 2020-01-02 3
+
>>> df.rolling('2D', on='A').sum()
+ A B
+2020-01-01 2020-01-01 1.0
+2020-01-02 2020-01-01 3.0
+2020-01-03 2020-01-02 6.0
+
expanding
(
min_periods=1
, axis=<no_default>
, method='single'
)
Provide expanding window calculations.
min_periods
+(int, default 1)
+— Minimum number of observations in window required to have a value;otherwise, result is np.nan
.
+axis
+(int or str, default 0)
+— If 0
or 'index'
, roll across the rows.1
or 'columns'
, roll across the columns.Series
this parameter is unused and defaults to 0.
+method
+(str {'single', 'table'}, default 'single')
+— Execute the rolling operation per single column or row ('single'
)or over the entire object ('table'
).engine='numba'
+in the method call.
rolling : Provides rolling window calculations.
ewm : Provides exponential weighted functions.
+Notes
+See :ref:Windowing Operations <window.expanding>
for further usage details
+and examples.
>>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})>>> df
+ B
+0 0.0
+1 1.0
+2 2.0
+3 NaN
+4 4.0
+
min_periods
+Expanding sum with 1 vs 3 observations needed to calculate a value.
+>>> df.expanding(1).sum()
+ B
+0 0.0
+1 1.0
+2 3.0
+3 3.0
+4 7.0
+>>> df.expanding(3).sum()
+ B
+0 NaN
+1 NaN
+2 3.0
+3 3.0
+4 7.0
+
ewm
(
com=None
, span=None
, halflife=None
, alpha=None
, min_periods=0
, adjust=True
, ignore_na=False
, axis=<no_default>
, times=None
, method='single'
)
Provide exponentially weighted (EW) calculations.
Exactly one of com
, span
, halflife
, or alpha
must be
+provided if times
is not provided. If times
is provided,
+halflife
and one of com
, span
or alpha
may be provided.
com
+(float, optional)
+— Specify decay in terms of center of mass\alpha = 1 / (1 + com)
, for :math:com \geq 0
.
+span
+(float, optional)
+— Specify decay in terms of span\alpha = 2 / (span + 1)
, for :math:span \geq 1
.
+halflife
+(float, str, timedelta, optional)
+— Specify decay in terms of half-life\alpha = 1 - \exp\left(-\ln(2) / halflife\right)
, for
+:math:halflife > 0
.times
is specified, a timedelta convertible unit over which an
+observation decays to half its value. Only applicable to mean()
,
+and halflife value will not apply to the other functions.
+alpha
+(float, optional)
+— Specify smoothing factor :math:\alpha
directly0 < \alpha \leq 1
.
+min_periods
+(int, default 0)
+— Minimum number of observations in window required to have a value;otherwise, result is np.nan
.
+adjust
+(bool, default True)
+— Divide by decaying adjustment factor in beginning periods to accountfor imbalance in relative weightings (viewing EWMA as a moving average).adjust=True
(default), the EW function is calculated using weights
+ :math:w_i = (1 - \alpha)^i
. For example, the EW moving average of the series
+ [:math:x_0, x_1, ..., x_t
] would be:adjust=False
, the exponentially weighted function is calculated
+ recursively:ignore_na
+(bool, default False)
+— Ignore missing values when calculating weights.ignore_na=False
(default), weights are based on absolute positions.
+ For example, the weights of :math:x_0
and :math:x_2
used in calculating
+ the final weighted average of [:math:x_0
, None, :math:x_2
] are
+ :math:(1-\alpha)^2
and :math:1
if adjust=True
, and
+ :math:(1-\alpha)^2
and :math:\alpha
if adjust=False
.ignore_na=True
, weights are based
+ on relative positions. For example, the weights of :math:x_0
and :math:x_2
+ used in calculating the final weighted average of
+ [:math:x_0
, None, :math:x_2
] are :math:1-\alpha
and :math:1
if
+ adjust=True
, and :math:1-\alpha
and :math:\alpha
if adjust=False
.axis
+({0, 1}, default 0)
+— If 0
or 'index'
, calculate across the rows.1
or 'columns'
, calculate across the columns.Series
this parameter is unused and defaults to 0.
+times
+(np.ndarray, Series, default None)
+— Times corresponding to the observations. Must be monotonically increasing and of datetime64[ns] dtype.
+method
+(str {'single', 'table'}, default 'single')
+— .. versionadded:: 1.4.0. Execute the rolling operation per single column or row ('single') or over the entire object ('table'). This argument is only implemented when specifying engine='numba' in the method call.
+rolling : Provides rolling window calculations.
expanding : Provides expanding transformations.
+Notes
+See :ref:Windowing Operations <window.exponentially_weighted>
+for further usage details and examples.
>>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
+>>> df
+ B
+0 0.0
+1 1.0
+2 2.0
+3 NaN
+4 4.0
+
>>> df.ewm(com=0.5).mean()
+ B
+0 0.000000
+1 0.750000
+2 1.615385
+3 1.615385
+4 3.670213
+>>> df.ewm(alpha=2 / 3).mean()
+ B
+0 0.000000
+1 0.750000
+2 1.615385
+3 1.615385
+4 3.670213
+
adjust
+>>> df.ewm(com=0.5, adjust=True).mean()
+ B
+0 0.000000
+1 0.750000
+2 1.615385
+3 1.615385
+4 3.670213
+>>> df.ewm(com=0.5, adjust=False).mean()
+ B
+0 0.000000
+1 0.666667
+2 1.555556
+3 1.555556
+4 3.650794
+
ignore_na
+>>> df.ewm(com=0.5, ignore_na=True).mean()
+ B
+0 0.000000
+1 0.750000
+2 1.615385
+3 1.615385
+4 3.225000
+>>> df.ewm(com=0.5, ignore_na=False).mean()
+ B
+0 0.000000
+1 0.750000
+2 1.615385
+3 1.615385
+4 3.670213
+
times
+Exponentially weighted mean with weights calculated with a timedelta halflife
+relative to times
.
>>> times = ['2020-01-01', '2020-01-03', '2020-01-10', '2020-01-15', '2020-01-17']
+>>> df.ewm(halflife='4 days', times=pd.DatetimeIndex(times)).mean()
+ B
+0 0.000000
+1 0.585786
+2 1.523889
+3 1.523889
+4 3.233686
+
first_valid_index
(
)
Return index for first non-NA value or None, if no non-NA value is found.
For Series:
>>> s = pd.Series([None, 3, 4])
+>>> s.first_valid_index()
+1
+>>> s.last_valid_index()
+2
+
>>> s = pd.Series([None, None])
+>>> print(s.first_valid_index())
+None
+>>> print(s.last_valid_index())
+None
+
If all elements in Series are NA/null, returns None.
+>>> s = pd.Series()
+>>> print(s.first_valid_index())
+None
+>>> print(s.last_valid_index())
+None
+
If Series is empty, returns None.
+For DataFrame:
+>>> df = pd.DataFrame({'A': [None, None, 2], 'B': [None, 3, 4]})
+>>> df
+ A B
+0 NaN NaN
+1 NaN 3.0
+2 2.0 4.0
+>>> df.first_valid_index()
+1
+>>> df.last_valid_index()
+2
+
>>> df = pd.DataFrame({'A': [None, None, None], 'B': [None, None, None]})
+>>> df
+ A B
+0 None None
+1 None None
+2 None None
+>>> print(df.first_valid_index())
+None
+>>> print(df.last_valid_index())
+None
+
If all elements in DataFrame are NA/null, returns None.
+>>> df = pd.DataFrame()
+>>> df
+Empty DataFrame
+Columns: []
+Index: []
+>>> print(df.first_valid_index())
+None
+>>> print(df.last_valid_index())
+None
+
If DataFrame is empty, returns None.
+last_valid_index
(
)
Return index for last non-NA value or None, if no non-NA value is found.
For Series:
>>> s = pd.Series([None, 3, 4])
+>>> s.first_valid_index()
+1
+>>> s.last_valid_index()
+2
+
>>> s = pd.Series([None, None])
+>>> print(s.first_valid_index())
+None
+>>> print(s.last_valid_index())
+None
+
If all elements in Series are NA/null, returns None.
+>>> s = pd.Series()
+>>> print(s.first_valid_index())
+None
+>>> print(s.last_valid_index())
+None
+
If Series is empty, returns None.
+For DataFrame:
+>>> df = pd.DataFrame({'A': [None, None, 2], 'B': [None, 3, 4]})
+>>> df
+ A B
+0 NaN NaN
+1 NaN 3.0
+2 2.0 4.0
+>>> df.first_valid_index()
+1
+>>> df.last_valid_index()
+2
+
>>> df = pd.DataFrame({'A': [None, None, None], 'B': [None, None, None]})
+>>> df
+ A B
+0 None None
+1 None None
+2 None None
+>>> print(df.first_valid_index())
+None
+>>> print(df.last_valid_index())
+None
+
If all elements in DataFrame are NA/null, returns None.
+>>> df = pd.DataFrame()
+>>> df
+Empty DataFrame
+Columns: []
+Index: []
+>>> print(df.first_valid_index())
+None
+>>> print(df.last_valid_index())
+None
+
If DataFrame is empty, returns None.
+__dataframe__
(
nan_as_null=False
, allow_copy=True
)
Return the dataframe interchange object implementing the interchange protocol.
nan_as_null
+(bool, default False)
+— nan_as_null
is DEPRECATED and has no effect. Please avoid using it; it will be removed in a future release.
+allow_copy
+(bool, default True)
+— Whether to allow memory copying when exporting. If set to False it would cause non-zero-copy exports to fail.
+The object which consuming library can use to ingress the dataframe.
Notes
+Details on the interchange protocol: +https://data-apis.org/dataframe-protocol/latest/index.html
+>>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
+>>> interchange_object = df_not_necessarily_pandas.__dataframe__()
+>>> interchange_object.column_names()
+Index(['A', 'B'], dtype='object')
+>>> df_pandas = (pd.api.interchange.from_dataframe
+... (interchange_object.select_columns_by_name(['A'])))
+>>> df_pandas
+ A
+0 1
+1 2
+
These methods (column_names
, select_columns_by_name
) should work
+for any dataframe library which implements the interchange protocol.
__dataframe_consortium_standard__
(
api_version=None
)
→ Any
Provide entry point to the Consortium DataFrame Standard API.
This is developed and maintained outside of pandas. +Please report any issues to https://github.com/data-apis/dataframe-api-compat.
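A minimal usage sketch, assuming the optional dataframe-api-compat package is installed (the API of the returned object is defined by that project, not by pandas):
+>>> df = pd.DataFrame({'a': [1, 2]})
+>>> standard_df = df.__dataframe_consortium_standard__() # doctest: +SKIP
+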
+__arrow_c_stream__
(
requested_schema=None
)
Export the pandas DataFrame as an Arrow C stream PyCapsule.
This relies on pyarrow to convert the pandas DataFrame to the Arrow
+format (and follows the default behaviour of pyarrow.Table.from_pandas
+in its handling of the index, i.e. store the index as a column except
+for RangeIndex).
+This conversion is not necessarily zero-copy.
requested_schema
+(PyCapsule, default None)
+— The schema to which the dataframe should be casted, passed as aPyCapsule containing a C ArrowSchema representation of the
+requested schema.
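A minimal sketch of how a consumer could ingest the DataFrame through this protocol, assuming a recent pyarrow build that implements the Arrow PyCapsule interface:
+>>> import pyarrow as pa
+>>> df = pd.DataFrame({'a': [1, 2, 3]})
+>>> capsule = df.__arrow_c_stream__() # PyCapsule wrapping an ArrowArrayStream
+>>> pa.table(df) # doctest: +SKIP
+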
+__repr__
(
)
→ str
Return a string representation for a particular DataFrame.
to_string
(
buf=None
, columns=None
, col_space=None
, header=True
, index=True
, na_rep='NaN'
, formatters=None
, float_format=None
, sparsify=None
, index_names=True
, justify=None
, max_rows=None
, max_cols=None
, show_dimensions=False
, decimal='.'
, line_width=None
, min_rows=None
, max_colwidth=None
, encoding=None
)
Render a DataFrame to a console-friendly tabular output.
buf
+(str, Path or StringIO-like, optional, default None)
+— Buffer to write to. If None, the output is returned as a string.columns
+(array-like, optional, default None)
+— The subset of columns to write. Writes all columns by default.col_space
+(int, list or dict of int, optional)
+— The minimum width of each column. If a list of ints is given, every integer corresponds to one column. If a dict is given, the key references the column, while the value defines the space to use.
+(bool or list of str, optional)
+— Write out the column names. If a list of columns is given, it is assumed to be aliases for the column names.index
+(bool, optional, default True)
+— Whether to print index (row) labels.na_rep
+(str, optional, default 'NaN')
+— String representation of NaN
to use.formatters
+(list, tuple or dict of one-param. functions, optional)
+— Formatter functions to apply to columns' elements by position or name.
+The result of each function must be a unicode string.
+List/tuple must be of length equal to the number of columns.
+float_format
+(one-parameter function, optional, default None)
+— Formatter function to apply to columns' elements if they are floats. This function must return a unicode string and will be
+applied only to the non-NaN
elements, with NaN
being
+handled by na_rep
.
+sparsify
+(bool, optional, default True)
+— Set to False for a DataFrame with a hierarchical index to print every multiindex key at each row.
+index_names
+(bool, optional, default True)
+— Prints the names of the indexes.justify
+(str, default None)
+— How to justify the column labels. If None, uses the option from the print configuration (controlled by set_option), 'right' out
+of the box. Valid values aremax_rows
+(int, optional)
+— Maximum number of rows to display in the console.max_cols
+(int, optional)
+— Maximum number of columns to display in the console.show_dimensions
+(bool, default False)
+— Display DataFrame dimensions (number of rows by number of columns).decimal
+(str, default '.')
+— Character recognized as decimal separator, e.g. ',' in Europe.line_width
+(int, optional)
+— Width to wrap a line in characters.min_rows
+(int, optional)
+— The number of rows to display in the console in a truncated repr(when number of rows is above max_rows
).
+max_colwidth
+(int, optional)
+— Max width to truncate each column in characters. By default, no limit.encoding
+(str, default "utf-8")
+— Set character encoding.
If buf is None, returns the result as a string. Otherwise returns None.
+to_html : Convert DataFrame to HTML.
>>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
+>>> df = pd.DataFrame(d)
+>>> print(df.to_string())
+ col1 col2
+0 1 4
+1 2 5
+2 3 6
+
items
(
)
Iterate over (column name, Series) pairs.
Iterates over the DataFrame columns, returning a tuple with +the column name and the content as a Series.
+label : object
+The column names for the DataFrame being iterated over.
+content : Series
+The column entries belonging to each label, as a Series.
+DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs. +DataFrame.itertuples : Iterate over DataFrame rows as namedtuples + of the values.
+>>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
+... 'population': [1864, 22000, 80000]},
+... index=['panda', 'polar', 'koala'])
+>>> df
+ species population
+panda bear 1864
+polar bear 22000
+koala marsupial 80000
+>>> for label, content in df.items():
+... print(f'label: {label}')
+... print(f'content: {content}', sep='\n')
+...
+label: species
+content:
+panda bear
+polar bear
+koala marsupial
+Name: species, dtype: object
+label: population
+content:
+panda 1864
+polar 22000
+koala 80000
+Name: population, dtype: int64
+
iterrows
(
)
Iterate over DataFrame rows as (index, Series) pairs.
index : label or tuple of label
The index of the row. A tuple for a MultiIndex.
data : Series
+The data of the row as a Series.
DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
DataFrame.items : Iterate over (column name, Series) pairs.
+Notes
+Because iterrows
returns a Series for each row,
+ it does not preserve dtypes across the rows (dtypes are
+ preserved across columns for DataFrames). To preserve dtypes while iterating over the rows, it is better
+ to use :meth:itertuples
which returns namedtuples of the values
+ and which is generally faster than iterrows
.
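A small sketch of the behaviour described above: the row Series is upcast to a common dtype, so the original column dtypes are not preserved.
+>>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
+>>> row = next(df.iterrows())[1]
+>>> row['int'].dtype
+dtype('float64')
+>>> df['int'].dtype
+dtype('int64')
+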
+itertuples
(
index=True
, name='Pandas'
)
Iterate over DataFrame rows as namedtuples.
index
+(bool, default True)
+— If True, return the index as the first element of the tuple.
+name
+(str or None, default "Pandas")
+— The name of the returned namedtuples or None to return regular tuples.
+An object to iterate over namedtuples for each row in the DataFrame with the first field possibly being the index and the following fields being the column values.
+DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs. +DataFrame.items : Iterate over (column name, Series) pairs.
+Notes
+The column names will be renamed to positional names if they are +invalid Python identifiers, repeated, or start with an underscore.
+>>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
+... index=['dog', 'hawk'])
+>>> df
+ num_legs num_wings
+dog 4 0
+hawk 2 2
+>>> for row in df.itertuples():
+... print(row)
+...
+Pandas(Index='dog', num_legs=4, num_wings=0)
+Pandas(Index='hawk', num_legs=2, num_wings=2)
+
By setting the index
parameter to False we can remove the index
+as the first element of the tuple:
>>> for row in df.itertuples(index=False):
+... print(row)
+...
+Pandas(num_legs=4, num_wings=0)
+Pandas(num_legs=2, num_wings=2)
+
With the name
parameter set we set a custom name for the yielded
+namedtuples:
>>> for row in df.itertuples(name='Animal'):
+... print(row)
+...
+Animal(Index='dog', num_legs=4, num_wings=0)
+Animal(Index='hawk', num_legs=2, num_wings=2)
+
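Passing name=None yields plain tuples instead of namedtuples (a small sketch using the same df as above):
+>>> list(df.itertuples(index=False, name=None))
+[(4, 0), (2, 2)]
+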
__len__
(
)
→ int
Returns the length of the info axis, but here we use the index.
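For example:
+>>> len(pd.DataFrame({'A': [1, 2, 3]}))
+3
+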
dot
(
other
)
Compute the matrix multiplication between the DataFrame and other.
This method computes the matrix product between the DataFrame and the +values of an other Series, DataFrame or a numpy array.
+It can also be called using self @ other
.
other
+(Series, DataFrame or array-like)
+— The other object to compute the matrix product with.
If other is a Series, return the matrix product between self and other as a Series. If other is a DataFrame or a numpy.array, return the matrix product of self and other in a DataFrame or a np.array.
+Series.dot: Similar method for Series.
Notes
+The dimensions of DataFrame and other must be compatible in order to +compute the matrix multiplication. In addition, the column names of +DataFrame and the index of other must contain the same values, as they +will be aligned prior to the multiplication.
+The dot method for Series computes the inner product, instead of the +matrix product here.
+Here we multiply a DataFrame with a Series.
>>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
+>>> s = pd.Series([1, 1, 2, 1])
+>>> df.dot(s)
+0 -4
+1 5
+dtype: int64
+
Here we multiply a DataFrame with another DataFrame.
+>>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
+>>> df.dot(other)
+ 0 1
+0 1 4
+1 2 2
+
Note that the dot method gives the same result as @
+>>> df @ other
+ 0 1
+0 1 4
+1 2 2
+
The dot method works also if other is an np.array.
+>>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
+>>> df.dot(arr)
+ 0 1
+0 1 4
+1 2 2
+
Note how shuffling of the objects does not change the result.
+>>> s2 = s.reindex([1, 0, 2, 3])
+>>> df.dot(s2)
+0 -4
+1 5
+dtype: int64
+
__matmul__
(
other
)
→ pandas.core.frame.DataFrame | pandas.core.series.Series
Matrix multiplication using binary @
operator.
__rmatmul__
(
other
)
→ DataFrame
Matrix multiplication using binary @
operator.
from_dict
(
data
, orient='columns'
, dtype=None
, columns=None
)
Construct DataFrame from dict of array-like or dicts.
Creates DataFrame object from dictionary by columns or by index +allowing dtype specification.
+data
+(dict)
+— Of the form {field : array-like} or {field : dict}.orient
+({'columns', 'index', 'tight'}, default 'columns')
+— The "orientation" of the data. If the keys of the passed dictshould be the columns of the resulting DataFrame, pass 'columns'
+(default). Otherwise if the keys should be rows, pass 'index'.
+If 'tight', assume a dict with keys ['index', 'columns', 'data',
+'index_names', 'column_names']. 'tight' was added in pandas 1.4.0 as an allowed value for the orient
argument
+dtype
+(dtype, default None)
+— Data type to force after DataFrame construction, otherwise infer.columns
+(list, default None)
+— Column labels to use when orient='index'
. Raises a ValueError if used with orient='columns'
or orient='tight'
.
+DataFrame.from_records : DataFrame from structured ndarray, sequence of tuples or dicts, or DataFrame. +DataFrame : DataFrame object creation using constructor. +DataFrame.to_dict : Convert the DataFrame to a dictionary.
+By default the keys of the dict become the DataFrame columns:
>>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
+>>> pd.DataFrame.from_dict(data)
+ col_1 col_2
+0 3 a
+1 2 b
+2 1 c
+3 0 d
+
Specify orient='index'
to create the DataFrame using dictionary
+keys as rows:
>>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
+>>> pd.DataFrame.from_dict(data, orient='index')
+ 0 1 2 3
+row_1 3 2 1 0
+row_2 a b c d
+
When using the 'index' orientation, the column names can be +specified manually:
+>>> pd.DataFrame.from_dict(data, orient='index',
+... columns=['A', 'B', 'C', 'D'])
+ A B C D
+row_1 3 2 1 0
+row_2 a b c d
+
Specify orient='tight'
to create the DataFrame using a 'tight'
+format:
>>> data = {'index': [('a', 'b'), ('a', 'c')],
+... 'columns': [('x', 1), ('y', 2)],
+... 'data': [[1, 3], [2, 4]],
+... 'index_names': ['n1', 'n2'],
+... 'column_names': ['z1', 'z2']}
+>>> pd.DataFrame.from_dict(data, orient='tight')
+z1 x y
+z2 1 2
+n1 n2
+a b 1 3
+ c 2 4
+
to_numpy
(
dtype=None
, copy=False
, na_value=<no_default>
)
Convert the DataFrame to a NumPy array.
By default, the dtype of the returned array will be the common NumPy
+dtype of all types in the DataFrame. For example, if the dtypes are
+float16
and float32
, the results dtype will be float32
.
+This may require copying data and coercing values, which may be
+expensive.
dtype
+(str or numpy.dtype, optional)
+— The dtype to pass to :meth:numpy.asarray
.
+copy
+(bool, default False)
+— Whether to ensure that the returned value is not a view on another array. Note that copy=False
does not ensure that
+to_numpy()
is no-copy. Rather, copy=True
ensures that
+a copy is made, even if not strictly necessary.
+na_value
+(Any, optional)
+— The value to use for missing values. The default value depends on dtype
and the dtypes of the DataFrame columns.
+Series.to_numpy : Similar method for Series.
>>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()array([[1, 3],
+ [2, 4]])
+
With heterogeneous data, the lowest common type will have to +be used.
+>>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
+>>> df.to_numpy()
+array([[1. , 3. ],
+ [2. , 4.5]])
+
For a mix of numeric and non-numeric types, the output array will +have object dtype.
+>>> df['C'] = pd.date_range('2000', periods=2)
+>>> df.to_numpy()
+array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
+ [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
+
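The na_value argument can be used to substitute missing values during the conversion; a small sketch:
+>>> pd.DataFrame({"A": [1.0, None]}).to_numpy(na_value=0)
+array([[1.],
+       [0.]])
+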
to_dict
(
orient='dict'
, into=<class 'dict'>
, index=True
)
Convert the DataFrame to a dictionary.
The type of the key-value pairs can be customized with the parameters +(see below).
+orient
+(str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'})
+— Determines the type of the values of the dictionary.orient
argument
+into
+(class, default dict)
+— The collections.abc.MutableMapping subclass used for all Mappings in the return value. Can be the actual class or an empty
+instance of the mapping type you want. If you want a
+collections.defaultdict, you must pass it initialized.
+index
+(bool, default True)
+— Whether to include the index item (and index_names item if orient
is 'tight') in the returned dictionary. Can only be False
+when orient
is 'split' or 'tight'.Return a collections.abc.MutableMapping object representing theDataFrame. The resulting transformation depends on the orient
+parameter.
DataFrame.from_dict: Create a DataFrame from a dictionary.
DataFrame.to_json: Convert a DataFrame to JSON format.
+>>> df = pd.DataFrame({'col1': [1, 2],
+... 'col2': [0.5, 0.75]},
+... index=['row1', 'row2'])
+>>> df
+ col1 col2
+row1 1 0.50
+row2 2 0.75
+>>> df.to_dict()
+{'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
+
You can specify the return orientation.
+>>> df.to_dict('series')
+{'col1': row1 1
+ row2 2
+Name: col1, dtype: int64,
+'col2': row1 0.50
+ row2 0.75
+Name: col2, dtype: float64}
+
>>> df.to_dict('split')
+{'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+ 'data': [[1, 0.5], [2, 0.75]]}
+
>>> df.to_dict('records')
+[{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
+
>>> df.to_dict('index')
+{'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
+
>>> df.to_dict('tight')
+{'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+ 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
+
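When the index is not needed, index=False (only valid together with the 'split' and 'tight' orientations) drops it from the result; a small sketch:
+>>> df.to_dict('split', index=False)
+{'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]}
+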
You can also specify the mapping type.
+>>> from collections import OrderedDict, defaultdict
+>>> df.to_dict(into=OrderedDict)
+OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
+ ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
+
If you want a defaultdict
, you need to initialize it:
>>> dd = defaultdict(list)
+>>> df.to_dict('records', into=dd)
+[defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
+ defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
+
to_gbq
(
destination_table
, project_id=None
, chunksize=None
, reauth=False
, if_exists='fail'
, auth_local_webserver=True
, table_schema=None
, location=None
, progress_bar=True
, credentials=None
)
Write a DataFrame to a Google BigQuery table.
.. deprecated:: 2.2.0
+Please use pandas_gbq.to_gbq
instead.
This function requires the pandas-gbq package
+<https://pandas-gbq.readthedocs.io>
__.
See the How to authenticate with Google BigQuery
+<https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>
__
+guide for authentication instructions.
destination_table
+(str)
+— Name of table to be written, in the form dataset.tablename
.project_id
+(str, optional)
+— Google BigQuery Account project ID. Optional when available fromthe environment.
+chunksize
+(int, optional)
+— Number of rows to be inserted in each chunk from the dataframe.Set to None
to load the whole dataframe at once.
+reauth
+(bool, default False)
+— Force Google BigQuery to re-authenticate the user. This is usefulif multiple accounts are used.
+if_exists
+(str, default 'fail')
+— Behavior when the destination table exists. Value can be one of:'fail'
+ If table exists raise pandasgbq.gbq.TableCreationError.
+'replace'
+ If table exists, drop it, recreate it, and insert data.
+'append'
+ If table exists, insert data. Create if does not exist.
+auth_local_webserver
+(bool, default True)
+— Use the local webserver flow
instead of the console flow
when getting user credentials.True
. Google has deprecated the
+ auth_local_webserver = False
"out of band" (copy-paste)
+ flow
+ <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>
_.
+table_schema
+(list of dicts, optional)
+— List of BigQuery table fields to which according DataFramecolumns conform to, e.g. [{'name': 'col1', 'type':
+'STRING'},...]
. If schema is not provided, it will be
+generated according to dtypes of DataFrame columns. See
+BigQuery API documentation on available names of a field.location
+(str, optional)
+— Location where the load job should run. See the BigQuery locations documentation
+<https://cloud.google.com/bigquery/docs/dataset-locations>
__ for a
+list of available locations. The location must match that of the
+target dataset.
+progress_bar
+(bool, default True)
+— Use the library tqdm
to show the progress bar for the upload, chunk by chunk.
credentials
+(google.auth.credentials.Credentials, optional)
+— Credentials for accessing Google APIs. Use this parameter to override default credentials, such as to use Compute Engine
+:class:google.auth.compute_engine.Credentials
or Service
+Account :class:google.oauth2.service_account.Credentials
+directly.
pandas_gbq.to_gbq : This function in the pandas-gbq library.
read_gbq : Read a DataFrame from Google BigQuery.
+Example taken from Google BigQuery documentation<https://cloud.google.com/bigquery/docs/samples/bigquery-pandas-gbq-to-gbq-simple>
_
>>> project_id = "my-project"
+>>> table_id = 'my_dataset.my_table'
+>>> df = pd.DataFrame({
+... "my_string": ["a", "b", "c"],
+... "my_int64": [1, 2, 3],
+... "my_float64": [4.0, 5.0, 6.0],
+... "my_bool1": [True, False, True],
+... "my_bool2": [False, True, False],
+... "my_dates": pd.date_range("now", periods=3),
+... }
+... )
+
>>> df.to_gbq(table_id, project_id=project_id) # doctest: +SKIP
+
from_records
(
data
, index=None
, exclude=None
, columns=None
, coerce_float=False
, nrows=None
)
Convert structured or record ndarray to DataFrame.
Creates a DataFrame object from a structured ndarray, sequence of +tuples or dicts, or DataFrame.
+data
+(structured ndarray, sequence of tuples or dicts, or DataFrame)
+— Structured input data.index
+(str, list of fields, array-like)
+— Field of array to use as the index, alternately a specific set ofinput labels to use.
+exclude
+(sequence, default None)
+— Columns or fields to exclude.columns
+(sequence, default None)
+— Column names to use. If the passed data do not have names associated with them, this argument provides names for the
+columns. Otherwise this argument indicates the order of the columns
+in the result (any names not found in the data will become all-NA
+columns).
+coerce_float
+(bool, default False)
+— Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets.
+nrows
+(int, default None)
+— Number of rows to read if data is an iterator.
DataFrame.from_dict : DataFrame from dict of array-like or dicts.
DataFrame : DataFrame object creation using constructor.
+Data can be provided as a structured ndarray:
>>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
+... dtype=[('col_1', 'i4'), ('col_2', 'U1')])
+>>> pd.DataFrame.from_records(data)
+ col_1 col_2
+0 3 a
+1 2 b
+2 1 c
+3 0 d
+
Data can be provided as a list of dicts:
+>>> data = [{'col_1': 3, 'col_2': 'a'},
+... {'col_1': 2, 'col_2': 'b'},
+... {'col_1': 1, 'col_2': 'c'},
+... {'col_1': 0, 'col_2': 'd'}]
+>>> pd.DataFrame.from_records(data)
+ col_1 col_2
+0 3 a
+1 2 b
+2 1 c
+3 0 d
+
Data can be provided as a list of tuples with corresponding columns:
+>>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
+>>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
+ col_1 col_2
+0 3 a
+1 2 b
+2 1 c
+3 0 d
+
to_records
(
index=True
, column_dtypes=None
, index_dtypes=None
)
Convert DataFrame to a NumPy record array.
Index will be included as the first field of the record array if +requested.
+index
+(bool, default True)
+— Include index in resulting record array, stored in 'index'field or using the index label, if set.
+column_dtypes
+(str, type, dict, default None)
+— If a string or type, the data type to store all columns. If a dictionary, a mapping of column names and indices (zero-indexed)
+to specific data types.
+index_dtypes
+(str, type, dict, default None)
+— If a string or type, the data type to store all index levels. If a dictionary, a mapping of index level names and indices
+(zero-indexed) to specific data types. This mapping is applied only if index=True.
+NumPy ndarray with the DataFrame labels as fields and each row of the DataFrame as entries.
+DataFrame.from_records: Convert structured or record ndarray to DataFrame. +numpy.rec.recarray: An ndarray that allows field access using + attributes, analogous to typed columns in a + spreadsheet.
+>>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
+... index=['a', 'b'])
+>>> df
+ A B
+a 1 0.50
+b 2 0.75
+>>> df.to_records()
+rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+ dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
If the DataFrame index has no label then the recarray field name +is set to 'index'. If the index has a label then this is used as the +field name:
+>>> df.index = df.index.rename("I")
+>>> df.to_records()
+rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+ dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
+
The index can be excluded from the record array:
+>>> df.to_records(index=False)
+rec.array([(1, 0.5 ), (2, 0.75)],
+ dtype=[('A', '<i8'), ('B', '<f8')])
+
Data types can be specified for the columns:
+>>> df.to_records(column_dtypes={"A": "int32"})
+rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+ dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
+
As well as for the index:
+>>> df.to_records(index_dtypes="<S2")
+rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+ dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
+
>>> index_dtypes = f"<S{df.index.str.len().max()}"
+>>> df.to_records(index_dtypes=index_dtypes)
+rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+ dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
+
to_stata
(
path
, convert_dates=None
, write_index=True
, byteorder=None
, time_stamp=None
, data_label=None
, variable_labels=None
, version=114
, convert_strl=None
, compression='infer'
, storage_options=None
, value_labels=None
)
Export DataFrame object to Stata dta format.
Writes the DataFrame to a Stata dataset file. +"dta" files contain a Stata dataset.
+path
+(str, path object, or buffer)
+— String, path object (implementing os.PathLike[str]
), or file-likeobject implementing a binary write()
function.
+convert_dates
+(dict)
+— Dictionary mapping columns containing datetime types to statainternal format to use when writing the dates. Options are 'tc',
+'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
+or a name. Datetime columns that do not have a conversion type
+specified will be converted to 'tc'. Raises NotImplementedError if
+a datetime column has timezone information.
+write_index
+(bool)
+— Write the index to Stata dataset.byteorder
+(str)
+— Can be ">", "<", "little", or "big". default is sys.byteorder
.time_stamp
+(datetime)
+— A datetime to use as file creation date. Default is the currenttime.
+data_label
+(str, optional)
+— A label for the data set. Must be 80 characters or smaller.variable_labels
+(dict)
+— Dictionary containing columns as keys and variable labels asvalues. Each label must be 80 characters or smaller.
+version
+({114, 117, 118, 119, None}, default 114)
+— Version to use in the output dta file. Set to None to let pandasdecide between 118 or 119 formats depending on the number of
+columns in the frame. Version 114 can be read by Stata 10 and
+later. Version 117 can be read by Stata 13 or later. Version 118
+is supported in Stata 14 and later. Version 119 is supported in
+Stata 15 and later. Version 114 limits string variables to 244
+characters or fewer while versions 117 and later allow strings
+with lengths up to 2,000,000 characters. Versions 118 and 119
+support Unicode characters, and version 119 supports more than
+32,767 variables.convert_strl
+(list, optional)
+— List of column names to convert to string columns to Stata StrLformat. Only available if version is 117. Storing strings in the
+StrL format can produce smaller dta files if strings have more than
+8 characters and values are repeated.
+compression
+(str or dict, default 'infer')
+— For on-the-fly compression of the output data. If 'infer' and 'path' ispath-like, then detect compression from the following extensions: '.gz',
+'.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+(otherwise no compression).
+Set to None
for no compression.
+Can also be a dict with key 'method'
set
+to one of {'zip'
, 'gzip'
, 'bz2'
, 'zstd'
, 'xz'
, 'tar'
} and
+other key-value pairs are forwarded to
+zipfile.ZipFile
, gzip.GzipFile
,
+bz2.BZ2File
, zstandard.ZstdCompressor
, lzma.LZMAFile
or
+tarfile.TarFile
, respectively.
+As an example, the following could be passed for faster compression and to create
+a reproducible gzip archive:
+compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}
..tar
files.storage_options
+(dict, optional)
+— Extra options that make sense for a particular storage connection, e.g.host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+are forwarded to urllib.request.Request
as header options. For other
+URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+forwarded to fsspec.open
. Please see fsspec
and urllib
for more
+details, and for more examples on storage options refer here
+<https://pandas.pydata.org/docs/user_guide/io.html?
+highlight=storage_options#reading-writing-remote-files>
_.
+value_labels
+(dict of dicts)
+— Dictionary containing columns as keys and dictionaries of column valueto labels as values. Labels for a single variable must be 32,000
+characters or smaller.NotImplementedError
+
+— ValueError
+
+— read_stata : Import Stata data files.io.stata.StataWriter : Low-level writer for Stata data files. +io.stata.StataWriter117 : Low-level writer for version 117 files.
+>>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon',
+... 'parrot'],
+... 'speed': [350, 18, 361, 15]})
+>>> df.to_stata('animals.dta') # doctest: +SKIP
+
to_feather
(
path
, **kwargs
)
Write a DataFrame to the binary Feather format.
path
+(str, path object, file-like object)
+— String, path object (implementing os.PathLike[str]
), or file-likeobject implementing a binary write()
function. If a string or a path,
+it will be used as Root Directory path when writing a partitioned dataset.
+**kwargs
+
+— Additional keywords passed to :func:pyarrow.feather.write_feather
.This includes the compression
, compression_level
, chunksize
+and version
keywords.
+Notes
+This function writes the dataframe as a feather file
+<https://arrow.apache.org/docs/python/feather.html>
_. Requires a default
+index. For saving the DataFrame with your custom index use a method that
+supports custom indices e.g. to_parquet
.
>>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+>>> df.to_feather("file.feather") # doctest: +SKIP
+
to_markdown
(
buf=None
, mode='wt'
, index=True
, storage_options=None
, **kwargs
)
Print DataFrame in Markdown-friendly format.
buf
+(str, Path or StringIO-like, optional, default None)
+— Buffer to write to. If None, the output is returned as a string.mode
+(str, optional)
+— Mode in which file is opened, "wt" by default.index
+(bool, optional, default True)
+— Add index (row) labels.storage_options
+(dict, optional)
+— Extra options that make sense for a particular storage connection, e.g.host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+are forwarded to urllib.request.Request
as header options. For other
+URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+forwarded to fsspec.open
. Please see fsspec
and urllib
for more
+details, and for more examples on storage options refer here
+<https://pandas.pydata.org/docs/user_guide/io.html?
+highlight=storage_options#reading-writing-remote-files>
_.
+**kwargs
+
+— These parameters will be passed to tabulate <https://pypi.org/project/tabulate>
_.DataFrame in Markdown-friendly format.
Notes
+Requires the tabulate <https://pypi.org/project/tabulate>
_ package.
>>> df = pd.DataFrame(
+... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
+... )
+>>> print(df.to_markdown())
+| | animal_1 | animal_2 |
+|---:|:-----------|:-----------|
+| 0 | elk | dog |
+| 1 | pig | quetzal |
+
Output markdown with a tabulate option.
+>>> print(df.to_markdown(tablefmt="grid"))
++----+------------+------------+
+| | animal_1 | animal_2 |
++====+============+============+
+| 0 | elk | dog |
++----+------------+------------+
+| 1 | pig | quetzal |
++----+------------+------------+
+
to_parquet
(
path=None
, engine='auto'
, compression='snappy'
, index=None
, partition_cols=None
, storage_options=None
, **kwargs
)
Write a DataFrame to the binary parquet format.
This function writes the dataframe as a parquet file
+<https://parquet.apache.org/>
_. You can choose different parquet
+backends, and have the option of compression. See
+:ref:the user guide <io.parquet>
for more details.
path
+(str, path object, file-like object, or None, default None)
+— String, path object (implementing os.PathLike[str]
), or file-likeobject implementing a binary write()
function. If None, the result is
+returned as bytes. If a string or path, it will be used as Root Directory
+path when writing a partitioned dataset.
+engine
+({'auto', 'pyarrow', 'fastparquet'}, default 'auto')
+— Parquet library to use. If 'auto', then the optionio.parquet.engine
is used. The default io.parquet.engine
+behavior is to try 'pyarrow', falling back to 'fastparquet' if
+'pyarrow' is unavailable.
+compression
+(str or None, default 'snappy')
+— Name of the compression to use. Use None
for no compression.Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'.
+index
+(bool, default None)
+— If True
, include the dataframe's index(es) in the file output.If False
, they will not be written to the file.
+If None
, similar to True
the dataframe's index(es)
+will be saved. However, instead of being saved as values,
+the RangeIndex will be stored as a range in the metadata so it
+doesn't require much space and is faster. Other indexes will
+be included as columns in the file output.
+partition_cols
+(list, optional, default None)
+— Column names by which to partition the dataset.Columns are partitioned in the order they are given.
+Must be None if path is not a string.
+storage_options
+(dict, optional)
+— Extra options that make sense for a particular storage connection, e.g.host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+are forwarded to urllib.request.Request
as header options. For other
+URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+forwarded to fsspec.open
. Please see fsspec
and urllib
for more
+details, and for more examples on storage options refer here
+<https://pandas.pydata.org/docs/user_guide/io.html?
+highlight=storage_options#reading-writing-remote-files>
_.
+**kwargs
+
+— Additional arguments passed to the parquet library. See
+:ref:pandas io <io.parquet>
for more details.
+read_parquet : Read a parquet file.
+DataFrame.to_orc : Write an orc file.
+DataFrame.to_csv : Write a csv file.
+DataFrame.to_sql : Write to a sql table.
+DataFrame.to_hdf : Write to hdf.
+Notes
+This function requires either the fastparquet
+<https://pypi.org/project/fastparquet>
or pyarrow
+<https://arrow.apache.org/docs/python/>
library.
>>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
+>>> df.to_parquet('df.parquet.gzip',
+... compression='gzip') # doctest: +SKIP
+>>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
+ col1 col2
+0 1 3
+1 2 4
+
If you want to get a buffer to the parquet content you can use an io.BytesIO
+object, as long as you don't use partition_cols, which creates multiple files.
+>>> import io
+>>> f = io.BytesIO()
+>>> df.to_parquet(f)
+>>> f.seek(0)
+0
+>>> content = f.read()
+
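The storage_options argument can also route the write to remote storage via fsspec. A minimal sketch, not part of the original docstring, assuming the s3fs package is installed and using a placeholder bucket and credentials:
+>>> df.to_parquet(
+...     "s3://my-bucket/df.parquet",              # hypothetical bucket
+...     storage_options={"anon": False},          # forwarded to fsspec.open
+... )  # doctest: +SKIP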
to_orc
(
path=None
, engine='pyarrow'
, index=None
, engine_kwargs=None
)
Write a DataFrame to the ORC format.
.. versionadded:: 1.5.0
+path
+(str, file-like object or None, default None)
+— If a string, it will be used as Root Directory path
+when writing a partitioned dataset. By file-like object,
+we refer to objects with a write() method, such as a file handle
+(e.g. via builtin open function). If path is None,
+a bytes object is returned.
+engine
+({'pyarrow'}, default 'pyarrow')
+— ORC library to use.
+index
+(bool, optional)
+— If True
, include the dataframe's index(es) in the file output. If False
, they will not be written to the file.
+If None
, similar to infer
the dataframe's index(es)
+will be saved. However, instead of being saved as values,
+the RangeIndex will be stored as a range in the metadata so it
+doesn't require much space and is faster. Other indexes will
+be included as columns in the file output.
+engine_kwargs
+(dict[str, Any] or None, default None)
+— Additional keyword arguments passed to :func:pyarrow.orc.write_table
.
+NotImplementedError
+
+— Dtype of one or more columns is category, unsigned integers, interval, period or sparse.
+ValueError
+
+— engine is not pyarrow.
+read_orc : Read an ORC file.
+DataFrame.to_parquet : Write a parquet file.
+DataFrame.to_csv : Write a csv file.
+DataFrame.to_sql : Write to a sql table.
+DataFrame.to_hdf : Write to hdf.
+Notes
+Before using this function you should read the user guide about
+ORC <io.orc> and :ref:install optional dependencies <install.warn_orc>.
+This function requires the pyarrow <https://arrow.apache.org/docs/python/>_ library.
+For supported dtypes please refer to supported ORC features in Arrow
+<https://arrow.apache.org/docs/cpp/orc.html#data-types>__.
+>>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
+>>> df.to_orc('df.orc')  # doctest: +SKIP
+>>> pd.read_orc('df.orc') # doctest: +SKIP
+ col1 col2
+0 1 4
+1 2 3
+
If you want to get a buffer to the orc content you can write it to io.BytesIO
+>>> import io
+>>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
+>>> b.seek(0) # doctest: +SKIP
+0
+>>> content = b.read() # doctest: +SKIP
+
to_html
(
buf=None
, columns=None
, col_space=None
, header=True
, index=True
, na_rep='NaN'
, formatters=None
, float_format=None
, sparsify=None
, index_names=True
, justify=None
, max_rows=None
, max_cols=None
, show_dimensions=False
, decimal='.'
, bold_rows=True
, classes=None
, escape=True
, notebook=False
, border=None
, table_id=None
, render_links=False
, encoding=None
)
Render a DataFrame as an HTML table.
buf
+(str, Path or StringIO-like, optional, default None)
+— Buffer to write to. If None, the output is returned as a string.
+columns
+(array-like, optional, default None)
+— The subset of columns to write. Writes all columns by default.
+col_space
+(str or int, list or dict of int or str, optional)
+— The minimum width of each column in CSS length units. An int is assumed to be px units.
+header
+(bool, optional)
+— Whether to print column labels, default True.
+index
+(bool, optional, default True)
+— Whether to print index (row) labels.
+na_rep
+(str, optional, default 'NaN')
+— String representation of NaN to use.
+formatters
+(list, tuple or dict of one-param. functions, optional)
+— Formatter functions to apply to columns' elements by position or
+name.
+The result of each function must be a unicode string.
+List/tuple must be of length equal to the number of columns.
+float_format
+(one-parameter function, optional, default None)
+— Formatter function to apply to columns' elements if they are
+floats. This function must return a unicode string and will be
+applied only to the non-NaN
elements, with NaN
being
+handled by na_rep
.
+sparsify
+(bool, optional, default True)
+— Set to False for a DataFrame with a hierarchical index to print
+every multiindex key at each row.
+index_names
+(bool, optional, default True)
+— Prints the names of the indexes.
+justify
+(str, default None)
+— How to justify the column labels. If None uses the option from
+the print configuration (controlled by set_option), 'right' out
+of the box. Valid values are left, right, center, justify, justify-all,
+start, end, inherit, match-parent, initial and unset.
+max_rows
+(int, optional)
+— Maximum number of rows to display in the console.
+max_cols
+(int, optional)
+— Maximum number of columns to display in the console.
+show_dimensions
+(bool, default False)
+— Display DataFrame dimensions (number of rows by number of columns).
+decimal
+(str, default '.')
+— Character recognized as decimal separator, e.g. ',' in Europe.
+bold_rows
+(bool, default True)
+— Make the row labels bold in the output.
+classes
+(str or list or tuple, default None)
+— CSS class(es) to apply to the resulting html table.
+escape
+(bool, default True)
+— Convert the characters <, >, and & to HTML-safe sequences.
+notebook
+({True, False}, default False)
+— Whether the generated HTML is for IPython Notebook.
+border
+(int)
+— A border=border
attribute is included in the opening<table>
tag. Default pd.options.display.html.border
.
+table_id
+(str, optional)
+— A css id is included in the opening <table> tag if specified.
+render_links
+(bool, default False)
+— Convert URLs to HTML links.
+encoding
+(str, default "utf-8")
+— Set character encoding.
+If buf is None, returns the result as a string. Otherwise returns None.
+to_string : Convert DataFrame to a string.
>>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
+>>> html_string = '''<table border="1" class="dataframe">
+... <thead>
+... <tr style="text-align: right;">
+... <th></th>
+... <th>col1</th>
+... <th>col2</th>
+... </tr>
+... </thead>
+... <tbody>
+... <tr>
+... <th>0</th>
+... <td>1</td>
+... <td>4</td>
+... </tr>
+... <tr>
+... <th>1</th>
+... <td>2</td>
+... <td>3</td>
+... </tr>
+... </tbody>
+... </table>'''
+>>> assert html_string == df.to_html()
+
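A brief sketch (not part of the original docstring) of the float_format parameter, which accepts a one-argument callable applied to float cells; the frame here is hypothetical:
+>>> df_f = pd.DataFrame({'price': [1.2345, 2.5]})
+>>> print(df_f.to_html(float_format='{:.2f}'.format))  # doctest: +SKIP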
to_xml
(
path_or_buffer=None
, index=True
, root_name='data'
, row_name='row'
, na_rep=None
, attr_cols=None
, elem_cols=None
, namespaces=None
, prefix=None
, encoding='utf-8'
, xml_declaration=True
, pretty_print=True
, parser='lxml'
, stylesheet=None
, compression='infer'
, storage_options=None
)
Render a DataFrame to an XML document.
.. versionadded:: 1.3.0
+path_or_buffer
+(str, path object, file-like object, or None, default None)
+— String, path object (implementing os.PathLike[str]
), or file-likeobject implementing a write()
function. If None, the result is returned
+as a string.
+index
+(bool, default True)
+— Whether to include index in XML document.root_name
+(str, default 'data')
+— The name of root element in XML document.row_name
+(str, default 'row')
+— The name of row element in XML document.na_rep
+(str, optional)
+— Missing data representation.attr_cols
+(list-like, optional)
+— List of columns to write as attributes in row element.Hierarchical columns will be flattened with underscore
+delimiting the different levels.
+elem_cols
+(list-like, optional)
+— List of columns to write as children in row element. By default,all columns output as children of row element. Hierarchical
+columns will be flattened with underscore delimiting the
+different levels.
+namespaces
+(dict, optional)
+— All namespaces to be defined in root element. Keys of dict
+should be prefix names and values of dict corresponding URIs.
+Default namespaces should be given empty string key. For
+example: namespaces = {"": "https://example.com"}
+
+prefix
+(str, optional)
+— Namespace prefix to be used for every element and/or attributein document. This should be one of the keys in namespaces
+dict.
+encoding
+(str, default 'utf-8')
+— Encoding of the resulting document.xml_declaration
+(bool, default True)
+— Whether to include the XML declaration at start of document.pretty_print
+(bool, default True)
+— Whether output should be pretty printed with indentation andline breaks.
+parser
+({'lxml','etree'}, default 'lxml')
+— Parser module to use for building of tree. Only 'lxml' and'etree' are supported. With 'lxml', the ability to use XSLT
+stylesheet is supported.
+stylesheet
+(str, path object or file-like object, optional)
+— A URL, file-like object, or a raw string containing an XSLTscript used to transform the raw XML output. Script should use
+layout of elements and attributes from original output. This
+argument requires lxml
to be installed. Only XSLT 1.0
+scripts and not later versions is currently supported.
+compression
+(str or dict, default 'infer')
+— For on-the-fly compression of the output data. If 'infer' and 'path_or_buffer' ispath-like, then detect compression from the following extensions: '.gz',
+'.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+(otherwise no compression).
+Set to None
for no compression.
+Can also be a dict with key 'method'
set
+to one of {'zip'
, 'gzip'
, 'bz2'
, 'zstd'
, 'xz'
, 'tar'
} and
+other key-value pairs are forwarded to
+zipfile.ZipFile
, gzip.GzipFile
,
+bz2.BZ2File
, zstandard.ZstdCompressor
, lzma.LZMAFile
or
+tarfile.TarFile
, respectively.
+As an example, the following could be passed for faster compression and to create
+a reproducible gzip archive:
+compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}
..tar
files.storage_options
+(dict, optional)
+— Extra options that make sense for a particular storage connection, e.g.host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+are forwarded to urllib.request.Request
as header options. For other
+URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+forwarded to fsspec.open
. Please see fsspec
and urllib
for more
+details, and for more examples on storage options refer here
+<https://pandas.pydata.org/docs/user_guide/io.html?
+highlight=storage_options#reading-writing-remote-files>
_.
+If io is None, returns the resulting XML format as a
+string. Otherwise returns None.
+to_json : Convert the pandas object to a JSON string.
+to_html : Convert DataFrame to an HTML table.
+>>> df = pd.DataFrame({'shape': ['square', 'circle', 'triangle'],
+...                    'degrees': [360, 360, 180],
+...                    'sides': [4, np.nan, 3]})
+
>>> df.to_xml() # doctest: +SKIP
+<?xml version='1.0' encoding='utf-8'?>
+<data>
+ <row>
+ <index>0</index>
+ <shape>square</shape>
+ <degrees>360</degrees>
+ <sides>4.0</sides>
+ </row>
+ <row>
+ <index>1</index>
+ <shape>circle</shape>
+ <degrees>360</degrees>
+ <sides/>
+ </row>
+ <row>
+ <index>2</index>
+ <shape>triangle</shape>
+ <degrees>180</degrees>
+ <sides>3.0</sides>
+ </row>
+</data>
+
>>> df.to_xml(attr_cols=[
+... 'index', 'shape', 'degrees', 'sides'
+... ]) # doctest: +SKIP
+<?xml version='1.0' encoding='utf-8'?>
+<data>
+ <row index="0" shape="square" degrees="360" sides="4.0"/>
+ <row index="1" shape="circle" degrees="360"/>
+ <row index="2" shape="triangle" degrees="180" sides="3.0"/>
+</data>
+
>>> df.to_xml(namespaces={"doc": "https://example.com"},
+... prefix="doc") # doctest: +SKIP
+<?xml version='1.0' encoding='utf-8'?>
+<doc:data xmlns:doc="https://example.com">
+ <doc:row>
+ <doc:index>0</doc:index>
+ <doc:shape>square</doc:shape>
+ <doc:degrees>360</doc:degrees>
+ <doc:sides>4.0</doc:sides>
+ </doc:row>
+ <doc:row>
+ <doc:index>1</doc:index>
+ <doc:shape>circle</doc:shape>
+ <doc:degrees>360</doc:degrees>
+ <doc:sides/>
+ </doc:row>
+ <doc:row>
+ <doc:index>2</doc:index>
+ <doc:shape>triangle</doc:shape>
+ <doc:degrees>180</doc:degrees>
+ <doc:sides>3.0</doc:sides>
+ </doc:row>
+</doc:data>
+
info
(
verbose=None
, buf=None
, max_cols=None
, memory_usage=None
, show_counts=None
)
Print a concise summary of a DataFrame.
This method prints information about a DataFrame including +the index dtype and columns, non-null values and memory usage.
+verbose
+(bool, optional)
+— Whether to print the full summary. By default, the setting in
+pandas.options.display.max_info_columns is followed.
+buf
+(writable buffer, defaults to sys.stdout)
+— Where to send the output. By default, the output is printed tosys.stdout. Pass a writable buffer if you need to further process
+the output.
+max_cols
+(int, optional)
+— When to switch from the verbose to the truncated output. If theDataFrame has more than max_cols
columns, the truncated output
+is used. By default, the setting in
+pandas.options.display.max_info_columns
is used.
+memory_usage
+(bool, str, optional)
+— Specifies whether total memory usage of the DataFrame
+elements (including the index) should be displayed. By default,
+this follows the pandas.options.display.memory_usage setting.
+See the Frequently Asked Questions <df-memory-usage> for more
+details.
+show_counts
+(bool, optional)
+— Whether to show the non-null counts. By default, this is shownonly if the DataFrame is smaller than
+pandas.options.display.max_info_rows
and
+pandas.options.display.max_info_columns
. A value of True always
+shows the counts, and False never shows the counts.
+This method prints a summary of a DataFrame and returns None.
DataFrame.describe : Generate descriptive statistics of DataFrame columns.
+DataFrame.memory_usage : Memory usage of DataFrame columns.
+>>> int_values = [1, 2, 3, 4, 5]
+>>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
+>>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
+>>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
+... "float_col": float_values})
+>>> df
+ int_col text_col float_col
+0 1 alpha 0.00
+1 2 beta 0.25
+2 3 gamma 0.50
+3 4 delta 0.75
+4 5 epsilon 1.00
+
Prints information of all columns:
+>>> df.info(verbose=True)
+<class 'pandas.core.frame.DataFrame'>
+RangeIndex: 5 entries, 0 to 4
+Data columns (total 3 columns):
+ # Column Non-Null Count Dtype
+--- ------ -------------- -----
+ 0 int_col 5 non-null int64
+ 1 text_col 5 non-null object
+ 2 float_col 5 non-null float64
+dtypes: float64(1), int64(1), object(1)
+memory usage: 248.0+ bytes
+
Prints a summary of columns count and its dtypes but not per column +information:
+>>> df.info(verbose=False)
+<class 'pandas.core.frame.DataFrame'>
+RangeIndex: 5 entries, 0 to 4
+Columns: 3 entries, int_col to float_col
+dtypes: float64(1), int64(1), object(1)
+memory usage: 248.0+ bytes
+
Pipe output of DataFrame.info to buffer instead of sys.stdout, get +buffer content and writes to a text file:
+>>> import io
+>>> buffer = io.StringIO()
+>>> df.info(buf=buffer)
+>>> s = buffer.getvalue()
+>>> with open("df_info.txt", "w",
+... encoding="utf-8") as f: # doctest: +SKIP
+... f.write(s)
+260
+
The memory_usage
parameter allows deep introspection mode, specially
+useful for big DataFrames and fine-tune memory optimization:
>>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
+>>> df = pd.DataFrame({
+... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
+... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
+... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
+... })
+>>> df.info()
+<class 'pandas.core.frame.DataFrame'>
+RangeIndex: 1000000 entries, 0 to 999999
+Data columns (total 3 columns):
+ # Column Non-Null Count Dtype
+--- ------ -------------- -----
+ 0 column_1 1000000 non-null object
+ 1 column_2 1000000 non-null object
+ 2 column_3 1000000 non-null object
+dtypes: object(3)
+memory usage: 22.9+ MB
+
>>> df.info(memory_usage='deep')
+<class 'pandas.core.frame.DataFrame'>
+RangeIndex: 1000000 entries, 0 to 999999
+Data columns (total 3 columns):
+ # Column Non-Null Count Dtype
+--- ------ -------------- -----
+ 0 column_1 1000000 non-null object
+ 1 column_2 1000000 non-null object
+ 2 column_3 1000000 non-null object
+dtypes: object(3)
+memory usage: 165.9 MB
+
memory_usage
(
index=True
, deep=False
)
Return the memory usage of each column in bytes.
The memory usage can optionally include the contribution of
+the index and elements of object
dtype.
This value is displayed in DataFrame.info
by default. This can be
+suppressed by setting pandas.options.display.memory_usage
to False.
index
+(bool, default True)
+— Specifies whether to include the memory usage of the DataFrame'sindex in returned Series. If index=True
, the memory usage of
+the index is the first item in the output.
+deep
+(bool, default False)
+— If True, introspect the data deeply by interrogatingobject
dtypes for system-level memory consumption, and include
+it in the returned values.
+A Series whose index is the original column names and whose values
+is the memory usage of each column in bytes.
+numpy.ndarray.nbytes : Total bytes consumed by the elements of an ndarray.
+Series.memory_usage : Bytes consumed by a Series.
+Categorical : Memory-efficient array for string values with many repeated values.
+DataFrame.info : Concise summary of a DataFrame.
+Notes
+See the :ref:Frequently Asked Questions <df-memory-usage>
for more
+details.
>>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
+>>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
+... for t in dtypes])
+>>> df = pd.DataFrame(data)
+>>> df.head()
+ int64 float64 complex128 object bool
+0 1 1.0 1.0+0.0j 1 True
+1 1 1.0 1.0+0.0j 1 True
+2 1 1.0 1.0+0.0j 1 True
+3 1 1.0 1.0+0.0j 1 True
+4 1 1.0 1.0+0.0j 1 True
+
>>> df.memory_usage()
+Index 128
+int64 40000
+float64 40000
+complex128 80000
+object 40000
+bool 5000
+dtype: int64
+
>>> df.memory_usage(index=False)
+int64 40000
+float64 40000
+complex128 80000
+object 40000
+bool 5000
+dtype: int64
+
The memory footprint of object
dtype columns is ignored by default:
>>> df.memory_usage(deep=True)
+Index 128
+int64 40000
+float64 40000
+complex128 80000
+object 180000
+bool 5000
+dtype: int64
+
Use a Categorical for efficient storage of an object-dtype column with +many repeated values.
+>>> df['object'].astype('category').memory_usage(deep=True)
+5244
+
transpose
(
*args
, copy=False
)
Transpose index and columns.
Reflect the DataFrame over its main diagonal by writing rows as columns
+and vice-versa. The property :attr:.T
is an accessor to the method
+:meth:transpose
.
*args
+(tuple, optional)
+— Accepted for compatibility with NumPy.copy
+(bool, default False)
+— Whether to copy the data after transposing, even for DataFrames
+with a single dtype.
+The copy keyword will change behavior in pandas 3.0.
+Copy-on-Write <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>__
+will be enabled by default, which means that all methods with a
+copy keyword will use a lazy copy mechanism to defer the copy and
+ignore the copy keyword. The copy keyword will be removed in a
+future version of pandas.
+You can already get the future behavior and improvements through
+enabling copy on write: pd.options.mode.copy_on_write = True
+
+The transposed DataFrame.
numpy.transpose : Permute the dimensions of a given array.
Notes
+Transposing a DataFrame with mixed dtypes will result in a homogeneous
+DataFrame with the object
dtype. In such a case, a copy of the data
+is always made.
Square DataFrame with homogeneous dtype
>>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
+>>> df1 = pd.DataFrame(data=d1)
+>>> df1
+ col1 col2
+0 1 3
+1 2 4
+
>>> df1_transposed = df1.T # or df1.transpose()
+>>> df1_transposed
+ 0 1
+col1 1 2
+col2 3 4
+
When the dtype is homogeneous in the original DataFrame, we get a +transposed DataFrame with the same dtype:
+>>> df1.dtypes
+col1 int64
+col2 int64
+dtype: object
+>>> df1_transposed.dtypes
+0 int64
+1 int64
+dtype: object
+
Non-square DataFrame with mixed dtypes
+>>> d2 = {'name': ['Alice', 'Bob'],
+... 'score': [9.5, 8],
+... 'employed': [False, True],
+... 'kids': [0, 0]}
+>>> df2 = pd.DataFrame(data=d2)
+>>> df2
+ name score employed kids
+0 Alice 9.5 False 0
+1 Bob 8.0 True 0
+
>>> df2_transposed = df2.T # or df2.transpose()
+>>> df2_transposed
+ 0 1
+name Alice Bob
+score 9.5 8.0
+employed False True
+kids 0 0
+
When the DataFrame has mixed dtypes, we get a transposed DataFrame with
+the object
dtype:
>>> df2.dtypes
+name object
+score float64
+employed bool
+kids int64
+dtype: object
+>>> df2_transposed.dtypes
+0 object
+1 object
+dtype: object
+
isetitem
(
loc
, value
)
Set the given value in the column with position loc
.
This is a positional analogue to __setitem__
.
loc
+(int or sequence of ints)
+— Index position for the column.
+value
+(scalar or arraylike)
+— Value(s) for the column.
+Notes
+frame.isetitem(loc, value)
is an in-place method as it will
+modify the DataFrame in place (not returning a new object). In contrast to
+frame.iloc[:, i] = value
which will try to update the existing values in
+place, frame.isetitem(loc, value)
will not update the values of the column
+itself in place, it will instead insert a new array.
In cases where frame.columns
is unique, this is equivalent to
+frame[frame.columns[i]] = value
.
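A minimal illustration (not from the original docstring) of positional column assignment with isetitem:
+>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+>>> df.isetitem(1, [30, 40])   # replace the column at position 1
+>>> df
+   A   B
+0  1  30
+1  2  40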
query
(
expr
, inplace=False
, **kwargs
)
Query the columns of a DataFrame with a boolean expression.
expr
+(str)
+— The query string to evaluate.
+You can refer to variables in the environment by prefixing them with an '@' character like @a + b.
+Column names with spaces or special characters (e.g. Area (cm^2)) can be referenced
+by surrounding them in backticks (`Area (cm^2)`). Column names which are Python keywords
+(like "list", "for", "import", etc) cannot be used.
+For example, if one of your columns is called a a and you want
+to sum it with b, your query should be `a a` + b.
+inplace
+(bool)
+— Whether to modify the DataFrame rather than creating a new one.
+**kwargs
+
+— See the documentation for :func:eval
for complete details on the keyword arguments accepted by :meth:DataFrame.query
.
+DataFrame resulting from the provided query expression or None if inplace=True
.
eval : Evaluate a string describing operations on DataFrame columns.
+DataFrame.eval : Evaluate a string describing operations on DataFrame columns.
+Notes
+The result of the evaluation of this expression is first passed to
+:attr:DataFrame.loc
and if that fails because of a
+multidimensional key (e.g., a DataFrame) then the result will be passed
+to :meth:DataFrame.__getitem__
.
This method uses the top-level :func:eval
function to
+evaluate the passed query.
The :meth:~pandas.DataFrame.query
method uses a slightly
+modified Python syntax by default. For example, the &
and |
+(bitwise) operators have the precedence of their boolean cousins,
+:keyword:and
and :keyword:or
. This is syntactically valid Python,
+however the semantics are different.
You can change the semantics of the expression by passing the keyword
+argument parser='python'
. This enforces the same semantics as
+evaluation in Python space. Likewise, you can pass engine='python'
+to evaluate an expression using Python itself as a backend. This is not
+recommended as it is inefficient compared to using numexpr
as the
+engine.
The :attr:DataFrame.index
and
+:attr:DataFrame.columns
attributes of the
+:class:~pandas.DataFrame
instance are placed in the query namespace
+by default, which allows you to treat both the index and columns of the
+frame as a column in the frame.
+The identifier index
is used for the frame index; you can also
+use the name of the index to identify it in a query. Please note that
+Python keywords may not be used as identifiers.
For further details and examples see the query
documentation in
+:ref:indexing <indexing.query>
.
Backtick quoted variables
+Backtick quoted variables are parsed as literal Python code and
+are converted internally to a Python valid identifier.
+This can lead to the following problems.
+During parsing a number of disallowed characters inside the backtick
+quoted string are replaced by strings that are allowed as a Python identifier.
+These characters include all operators in Python, the space character, the
+question mark, the exclamation mark, the dollar sign, and the euro sign.
+For other characters that fall outside the ASCII range (U+0001..U+007F)
+and those that are not further specified in PEP 3131,
+the query parser will raise an error.
+This excludes whitespace different than the space character,
+but also the hashtag (as it is used for comments) and the backtick
+itself (backtick can also not be escaped).
+In a special case, quotes that make a pair around a backtick can
+confuse the parser.
+For example, it's` > `that's
will raise an error,
+as it forms a quoted string ('s > `that'
) with a backtick inside.
See also the Python documentation about lexical analysis
+(https://docs.python.org/3/reference/lexical_analysis.html)
+in combination with the source code in :mod:pandas.core.computation.parsing
.
>>> df = pd.DataFrame({'A': range(1, 6),
+...                    'B': range(10, 0, -2),
+... 'C C': range(10, 5, -1)})
+>>> df
+ A B C C
+0 1 10 10
+1 2 8 9
+2 3 6 8
+3 4 4 7
+4 5 2 6
+>>> df.query('A > B')
+ A B C C
+4 5 2 6
+
The previous expression is equivalent to
+>>> df[df.A > df.B]
+ A B C C
+4 5 2 6
+
For columns with spaces in their name, you can use backtick quoting.
+>>> df.query('B == `C C`')
+ A B C C
+0 1 10 10
+
The previous expression is equivalent to
+>>> df[df.B == df['C C']]
+ A B C C
+0 1 10 10
+
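As noted above, the parser and engine keywords can be switched; a small sketch, not part of the original examples:
+>>> df.query('A > B', engine='python')   # evaluated in Python space instead of numexpr
+   A  B  C C
+4  5  2    6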
eval
(
expr
, inplace=False
, **kwargs
)
Evaluate a string describing operations on DataFrame columns.
Operates on columns only, not specific rows or elements. This allows
+eval
to run arbitrary code, which can make you vulnerable to code
+injection if you pass user input to this function.
expr
+(str)
+— The expression string to evaluate.inplace
+(bool, default False)
+— If the expression contains an assignment, whether to perform the
+operation inplace and mutate the existing DataFrame. Otherwise,
+a new DataFrame is returned.
+**kwargs
+
+— See the documentation for :func:eval
for complete detailson the keyword arguments accepted by
+:meth:~pandas.DataFrame.query
.
+The result of the evaluation or None if inplace=True
.
DataFrame.query : Evaluates a boolean expression to query the columns of a frame.
+DataFrame.assign : Can evaluate an expression or function to create new values for a column.
+eval : Evaluate a Python expression as a string using various backends.
+Notes
+For more details see the API documentation for :func:~eval
.
+For detailed examples see :ref:enhancing performance with eval
+<enhancingperf.eval>
.
>>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
+>>> df
+ A B
+0 1 10
+1 2 8
+2 3 6
+3 4 4
+4 5 2
+>>> df.eval('A + B')
+0 11
+1 10
+2 9
+3 8
+4 7
+dtype: int64
+
Assignment is allowed though by default the original DataFrame is not +modified.
+>>> df.eval('C = A + B')
+ A B C
+0 1 10 11
+1 2 8 10
+2 3 6 9
+3 4 4 8
+4 5 2 7
+>>> df
+ A B
+0 1 10
+1 2 8
+2 3 6
+3 4 4
+4 5 2
+
Multiple columns can be assigned to using multi-line expressions:
+>>> df.eval(
+... '''
+... C = A + B
+... D = A - B
+... '''
+... )
+ A B C D
+0 1 10 11 -9
+1 2 8 10 -6
+2 3 6 9 -3
+3 4 4 8 0
+4 5 2 7 3
+
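With inplace=True the assignment mutates the original frame instead of returning a new one; a small sketch, not part of the original examples:
+>>> df.eval('C = A + B', inplace=True)
+>>> df
+   A   B   C
+0  1  10  11
+1  2   8  10
+2  3   6   9
+3  4   4   8
+4  5   2   7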
select_dtypes
(
include=None
, exclude=None
)
Return a subset of the DataFrame's columns based on the column dtypes.
The subset of the frame including the dtypes in include
andexcluding the dtypes in exclude
.
ValueError
+
+— include and exclude are empty, or
+include and exclude have overlapping elements.
+DataFrame.dtypes : Return Series with the data type of each column.
Notes
+* To select all numeric types, use np.number or 'number'
+* To select strings you must use the object dtype, but note that
+  this will return all object dtype columns
+* See the numpy dtype hierarchy
+  <https://numpy.org/doc/stable/reference/arrays.scalars.html>__
+* To select datetimes, use np.datetime64, 'datetime' or 'datetime64'
+* To select timedeltas, use np.timedelta64, 'timedelta' or 'timedelta64'
+* To select Pandas categorical dtypes, use 'category'
+* To select Pandas datetimetz dtypes, use 'datetimetz' or 'datetime64[ns, tz]'
>>> df = pd.DataFrame({'a': [1, 2] * 3,
+...                    'b': [True, False] * 3,
+... 'c': [1.0, 2.0] * 3})
+>>> df
+ a b c
+0 1 True 1.0
+1 2 False 2.0
+2 1 True 1.0
+3 2 False 2.0
+4 1 True 1.0
+5 2 False 2.0
+
>>> df.select_dtypes(include='bool')
+ b
+0 True
+1 False
+2 True
+3 False
+4 True
+5 False
+
>>> df.select_dtypes(include=['float64'])
+ c
+0 1.0
+1 2.0
+2 1.0
+3 2.0
+4 1.0
+5 2.0
+
>>> df.select_dtypes(exclude=['int64'])
+ b c
+0 True 1.0
+1 False 2.0
+2 True 1.0
+3 False 2.0
+4 True 1.0
+5 False 2.0
+
insert
(
loc
, column
, value
, allow_duplicates=<no_default>
)
Insert column into DataFrame at specified location.
Raises a ValueError if column
is already contained in the DataFrame,
+unless allow_duplicates
is set to True.
loc
+(int)
+— Insertion index. Must verify 0 <= loc <= len(columns).
+column
+(str, number, or hashable object)
+— Label of the inserted column.
+value
+(Scalar, Series, or array-like)
+— Content of the inserted column.
+allow_duplicates
+(bool, optional, default lib.no_default)
+— Allow duplicate column labels to be created.
+Index.insert : Insert new item by index.
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+>>> df
+ col1 col2
+0 1 3
+1 2 4
+>>> df.insert(1, "newcol", [99, 99])
+>>> df
+ col1 newcol col2
+0 1 99 3
+1 2 99 4
+>>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
+>>> df
+ col1 col1 newcol col2
+0 100 1 99 3
+1 100 2 99 4
+
Notice that pandas uses index alignment in case of value
from type Series
:
>>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
+>>> df
+ col0 col1 col1 newcol col2
+0 NaN 100 1 99 3
+1 5.0 100 2 99 4
+
assign
(
**kwargs
)
Assign new columns to a DataFrame.
Returns a new object with all original columns in addition to new ones. +Existing columns that are re-assigned will be overwritten.
+**kwargs
+(dict of {str: callable or Series})
+— The column names are keywords. If the values arecallable, they are computed on the DataFrame and
+assigned to the new columns. The callable must not
+change input DataFrame (though pandas doesn't check it).
+If the values are not callable, (e.g. a Series, scalar, or array),
+they are simply assigned.
+A new DataFrame with the new columns in addition to
+all the existing columns.
+Notes
+Assigning multiple columns within the same assign
is possible.
+Later items in '**kwargs' may refer to newly created or modified
+columns in 'df'; items are computed and assigned into 'df' in order.
>>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
+...                   index=['Portland', 'Berkeley'])
+>>> df
+ temp_c
+Portland 17.0
+Berkeley 25.0
+
Where the value is a callable, evaluated on df
:
>>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
+ temp_c temp_f
+Portland 17.0 62.6
+Berkeley 25.0 77.0
+
Alternatively, the same behavior can be achieved by directly +referencing an existing Series or sequence:
+>>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
+ temp_c temp_f
+Portland 17.0 62.6
+Berkeley 25.0 77.0
+
You can create multiple columns within the same assign where one +of the columns depends on another one defined within the same assign:
+>>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
+... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
+ temp_c temp_f temp_k
+Portland 17.0 62.6 290.15
+Berkeley 25.0 77.0 298.15
+
set_axis
(
labels
, axis=0
, copy=None
)
Assign desired index to given axis.
Indexes for column or row labels can be changed by assigning +a list-like or Index.
+labels
+(list-like, Index)
+— The values for the new index.axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis to update. The value 0 identifies the rows. For Series
this parameter is unused and defaults to 0.
+copy
+(bool, default True)
+— Whether to make a copy of the underlying data.copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas.You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+An object of type DataFrame.
DataFrame.rename_axis : Alter the name of the index or columns.
Examples
+>>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+
Change the row labels.
+
+>>> df.set_axis(['a', 'b', 'c'], axis='index')
+ A B
+a 1 4
+b 2 5
+c 3 6
+
Change the column labels.
+
+>>> df.set_axis(['I', 'II'], axis='columns')
+ I II
+0 1 4
+1 2 5
+2 3 6
+
reindex
(
labels=None
, index=None
, columns=None
, axis=None
, method=None
, copy=None
, level=None
, fill_value=nan
, limit=None
, tolerance=None
)
Conform DataFrame to new index with optional filling logic.
Places NA/NaN in locations having no value in the previous index. A new object
+is produced unless the new index is equivalent to the current one and
+copy=False
.
DataFrame.set_index : Set row labels.
+DataFrame.reset_index : Remove row labels or move them to new columns.
+DataFrame.reindex_like : Change to same indices as other DataFrame.
+DataFrame.reindex
supports two calling conventions
(index=index_labels, columns=column_labels, ...)
(labels, axis={'index', 'columns'}, ...)
We highly recommend using keyword arguments to clarify your +intent.
+Create a dataframe with some fictional data.
+>>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
+>>> df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301],
+... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},
+... index=index)
+>>> df
+ http_status response_time
+Firefox 200 0.04
+Chrome 200 0.02
+Safari 404 0.07
+IE10 404 0.08
+Konqueror 301 1.00
+
Create a new index and reindex the dataframe. By default
+values in the new index that do not have corresponding
+records in the dataframe are assigned NaN
.
>>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
+... 'Chrome']
+>>> df.reindex(new_index)
+ http_status response_time
+Safari 404.0 0.07
+Iceweasel NaN NaN
+Comodo Dragon NaN NaN
+IE10 404.0 0.08
+Chrome 200.0 0.02
+
We can fill in the missing values by passing a value to
+the keyword fill_value
. Because the index is not monotonically
+increasing or decreasing, we cannot use arguments to the keyword
+method
to fill the NaN
values.
>>> df.reindex(new_index, fill_value=0)
+ http_status response_time
+Safari 404 0.07
+Iceweasel 0 0.00
+Comodo Dragon 0 0.00
+IE10 404 0.08
+Chrome 200 0.02
+
>>> df.reindex(new_index, fill_value='missing')
+ http_status response_time
+Safari 404 0.07
+Iceweasel missing missing
+Comodo Dragon missing missing
+IE10 404 0.08
+Chrome 200 0.02
+
We can also reindex the columns.
+>>> df.reindex(columns=['http_status', 'user_agent'])
+ http_status user_agent
+Firefox 200 NaN
+Chrome 200 NaN
+Safari 404 NaN
+IE10 404 NaN
+Konqueror 301 NaN
+
Or we can use "axis-style" keyword arguments
+>>> df.reindex(['http_status', 'user_agent'], axis="columns")
+ http_status user_agent
+Firefox 200 NaN
+Chrome 200 NaN
+Safari 404 NaN
+IE10 404 NaN
+Konqueror 301 NaN
+
To further illustrate the filling functionality in
+reindex
, we will create a dataframe with a
+monotonically increasing index (for example, a sequence
+of dates).
>>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
+>>> df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]},
+... index=date_index)
+>>> df2
+ prices
+2010-01-01 100.0
+2010-01-02 101.0
+2010-01-03 NaN
+2010-01-04 100.0
+2010-01-05 89.0
+2010-01-06 88.0
+
Suppose we decide to expand the dataframe to cover a wider +date range.
+>>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
+>>> df2.reindex(date_index2)
+ prices
+2009-12-29 NaN
+2009-12-30 NaN
+2009-12-31 NaN
+2010-01-01 100.0
+2010-01-02 101.0
+2010-01-03 NaN
+2010-01-04 100.0
+2010-01-05 89.0
+2010-01-06 88.0
+2010-01-07 NaN
+
The index entries that did not have a value in the original data frame
+(for example, '2009-12-29') are by default filled with NaN
.
+If desired, we can fill in the missing values using one of several
+options.
For example, to back-propagate the last valid value to fill the NaN
+values, pass bfill
as an argument to the method
keyword.
>>> df2.reindex(date_index2, method='bfill')
+ prices
+2009-12-29 100.0
+2009-12-30 100.0
+2009-12-31 100.0
+2010-01-01 100.0
+2010-01-02 101.0
+2010-01-03 NaN
+2010-01-04 100.0
+2010-01-05 89.0
+2010-01-06 88.0
+2010-01-07 NaN
+
Please note that the NaN
value present in the original dataframe
+(at index value 2010-01-03) will not be filled by any of the
+value propagation schemes. This is because filling while reindexing
+does not look at dataframe values, but only compares the original and
+desired indexes. If you do want to fill in the NaN
values present
+in the original dataframe, use the fillna()
method.
See the :ref:user guide <basics.reindexing>
for more.
drop
(
labels=None
, axis=0
, index=None
, columns=None
, level=None
, inplace=False
, errors='raise'
)
Drop specified labels from rows or columns.
Remove rows or columns by specifying label names and corresponding
+axis, or by directly specifying index or column names. When using a
+multi-index, labels on different levels can be removed by specifying
+the level. See the :ref:user guide <advanced.shown_levels>
+for more information about the now unused levels.
labels
+(single label or list-like)
+— Index or column labels to drop. A tuple will be used as a single
+label and not treated as a list-like.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— Whether to drop labels from the index (0 or 'index') orcolumns (1 or 'columns').
+index
+(single label or list-like)
+— Alternative to specifying axis (labels, axis=0
is equivalent to index=labels
).
+columns
+(single label or list-like)
+— Alternative to specifying axis (labels, axis=1
is equivalent to columns=labels
).
+level
+(int or level name, optional)
+— For MultiIndex, level from which the labels will be removed.inplace
+(bool, default False)
+— If False, return a copy. Otherwise, do operationin place and return None.
+errors
+({'ignore', 'raise'}, default 'raise')
+— If 'ignore', suppress error and only existing labels aredropped.
+Returns DataFrame or None: DataFrame with the specified
+index or column labels removed, or None if inplace=True.
+KeyError
+
+— If any of the labels is not found in the selected axis.
+DataFrame.loc : Label-location based indexer for selection by label.
+DataFrame.dropna : Return DataFrame with labels on given axis omitted where (all or any) data are missing.
+DataFrame.drop_duplicates : Return DataFrame with duplicate rows removed, optionally only considering certain columns.
+Series.drop : Return Series with specified index labels removed.
+>>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
+...                   columns=['A', 'B', 'C', 'D'])
+>>> df
+ A B C D
+0 0 1 2 3
+1 4 5 6 7
+2 8 9 10 11
+
Drop columns
+>>> df.drop(['B', 'C'], axis=1)
+ A D
+0 0 3
+1 4 7
+2 8 11
+
>>> df.drop(columns=['B', 'C'])
+ A D
+0 0 3
+1 4 7
+2 8 11
+
Drop a row by index
+>>> df.drop([0, 1])
+ A B C D
+2 8 9 10 11
+
Drop columns and/or rows of MultiIndex DataFrame
+>>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'],
+... ['speed', 'weight', 'length']],
+... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
+... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
+>>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
+... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
+... [250, 150], [1.5, 0.8], [320, 250],
+... [1, 0.8], [0.3, 0.2]])
+>>> df
+ big small
+llama speed 45.0 30.0
+ weight 200.0 100.0
+ length 1.5 1.0
+cow speed 30.0 20.0
+ weight 250.0 150.0
+ length 1.5 0.8
+falcon speed 320.0 250.0
+ weight 1.0 0.8
+ length 0.3 0.2
+
Drop a specific index combination from the MultiIndex
+DataFrame, i.e., drop the combination 'falcon'
and
+'weight'
, which deletes only the corresponding row
>>> df.drop(index=('falcon', 'weight'))
+ big small
+llama speed 45.0 30.0
+ weight 200.0 100.0
+ length 1.5 1.0
+cow speed 30.0 20.0
+ weight 250.0 150.0
+ length 1.5 0.8
+falcon speed 320.0 250.0
+ length 0.3 0.2
+
>>> df.drop(index='cow', columns='small')
+ big
+llama speed 45.0
+ weight 200.0
+ length 1.5
+falcon speed 320.0
+ weight 1.0
+ length 0.3
+
>>> df.drop(index='length', level=1)
+ big small
+llama speed 45.0 30.0
+ weight 200.0 100.0
+cow speed 30.0 20.0
+ weight 250.0 150.0
+falcon speed 320.0 250.0
+ weight 1.0 0.8
+
rename
(
mapper=None
, index=None
, columns=None
, axis=None
, copy=None
, inplace=False
, level=None
, errors='ignore'
)
Rename columns or index labels.
Function / dict values must be unique (1-to-1). Labels not contained in +a dict / Series will be left as-is. Extra labels listed don't throw an +error.
+See the :ref:user guide <basics.rename>
for more.
mapper
+(dict-like or function)
+— Dict-like or function transformations to apply tothat axis' values. Use either mapper
and axis
to
+specify the axis to target with mapper
, or index
and
+columns
.
+index
+(dict-like or function)
+— Alternative to specifying axis (mapper, axis=0
is equivalent to index=mapper
).
+columns
+(dict-like or function)
+— Alternative to specifying axis (mapper, axis=1
is equivalent to columns=mapper
).
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— Axis to target with mapper
. Can be either the axis name('index', 'columns') or number (0, 1). The default is 'index'.
+copy
+(bool, default True)
+— Also copy underlying data.copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas.You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+inplace
+(bool, default False)
+— Whether to modify the DataFrame rather than creating a new one.If True then value of copy is ignored.
+level
+(int or level name, default None)
+— In case of a MultiIndex, only rename labels in the specifiedlevel.
+errors
+({'ignore', 'raise'}, default 'ignore')
+— If 'raise', raise a KeyError
when a dict-like mapper
, index
,or columns
contains labels that are not present in the Index
+being transformed.
+If 'ignore', existing keys will be renamed and extra keys will be
+ignored.
+DataFrame with the renamed axis labels or None if inplace=True
.
KeyError
+
+— If any of the labels is not found in the selected axis and"errors='raise'".
+DataFrame.rename_axis : Set the name of the axis.
DataFrame.rename
supports two calling conventions
(index=index_mapper, columns=columns_mapper, ...)
(mapper, axis={'index', 'columns'}, ...)
We highly recommend using keyword arguments to clarify your +intent.
+Rename columns using a mapping:
+>>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+>>> df.rename(columns={"A": "a", "B": "c"})
+ a c
+0 1 4
+1 2 5
+2 3 6
+
Rename index using a mapping:
+>>> df.rename(index={0: "x", 1: "y", 2: "z"})
+ A B
+x 1 4
+y 2 5
+z 3 6
+
Cast index labels to a different type:
+>>> df.index
+RangeIndex(start=0, stop=3, step=1)
+>>> df.rename(index=str).index
+Index(['0', '1', '2'], dtype='object')
+
>>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
+Traceback (most recent call last):
+KeyError: ['C'] not found in axis
+
Using axis-style parameters:
+>>> df.rename(str.lower, axis='columns')
+ a b
+0 1 4
+1 2 5
+2 3 6
+
>>> df.rename({1: 2, 2: 4}, axis='index')
+ A B
+0 1 4
+2 2 5
+4 3 6
+
pop
(
item
)
Return item and drop from frame. Raise KeyError if not found.
item
+(label)
+— Label of column to be popped.
+>>> df = pd.DataFrame([('falcon', 'bird', 389.0),
+...                    ('parrot', 'bird', 24.0),
+... ('lion', 'mammal', 80.5),
+... ('monkey', 'mammal', np.nan)],
+... columns=('name', 'class', 'max_speed'))
+>>> df
+ name class max_speed
+0 falcon bird 389.0
+1 parrot bird 24.0
+2 lion mammal 80.5
+3 monkey mammal NaN
+
>>> df.pop('class')
+0 bird
+1 bird
+2 mammal
+3 mammal
+Name: class, dtype: object
+
>>> df
+ name max_speed
+0 falcon 389.0
+1 parrot 24.0
+2 lion 80.5
+3 monkey NaN
+
shift
(
periods=1
, freq=None
, axis=0
, fill_value=<no_default>
, suffix=None
)
Shift index by desired number of periods with an optional time freq
.
When freq
is not passed, shift the index without realigning the data.
+If freq
is passed (in this case, the index must be date or datetime,
+or it will raise a NotImplementedError
), the index will be
+increased using the periods and the freq
. freq
can be inferred
+when specified as "infer" as long as either freq or inferred_freq
+attribute is set in the index.
periods
+(int or Sequence)
+— Number of periods to shift. Can be positive or negative.If an iterable of ints, the data will be shifted once by each int.
+This is equivalent to shifting by one value at a time and
+concatenating all resulting frames. The resulting columns will have
+the shift suffixed to their column names. For multiple periods,
+axis must not be 1.
+freq
+(DateOffset, tseries.offsets, timedelta, or str, optional)
+— Offset to use from the tseries module or time rule (e.g. 'EOM').If freq
is specified then the index values are shifted but the
+data is not realigned. That is, use freq
if you would like to
+extend the index when shifting and preserve the original data.
+If freq
is specified as "infer" then it will be inferred from
+the freq or inferred_freq attributes of the index. If neither of
+those attributes exist, a ValueError is thrown.
+axis
+({0 or 'index', 1 or 'columns', None}, default None)
+— Shift direction. For Series
this parameter is unused and defaults to 0.fill_value
+(object, optional)
+— The scalar value to use for newly introduced missing values.the default depends on the dtype of self
.
+For numeric data, np.nan
is used.
+For datetime, timedelta, or period data, etc. :attr:NaT
is used.
+For extension dtypes, self.dtype.na_value
is used.
+suffix
+(str, optional)
+— If str and periods is an iterable, this is added after the columnname and before the shift value for each shifted column name.
+Copy of input object, shifted.
Index.shift : Shift values of Index.
+DatetimeIndex.shift : Shift values of DatetimeIndex.
+PeriodIndex.shift : Shift values of PeriodIndex.
+>>> df = pd.DataFrame({"Col1": [10, 20, 15, 30, 45],... "Col2": [13, 23, 18, 33, 48],
+... "Col3": [17, 27, 22, 37, 52]},
+... index=pd.date_range("2020-01-01", "2020-01-05"))
+>>> df
+ Col1 Col2 Col3
+2020-01-01 10 13 17
+2020-01-02 20 23 27
+2020-01-03 15 18 22
+2020-01-04 30 33 37
+2020-01-05 45 48 52
+
>>> df.shift(periods=3)
+ Col1 Col2 Col3
+2020-01-01 NaN NaN NaN
+2020-01-02 NaN NaN NaN
+2020-01-03 NaN NaN NaN
+2020-01-04 10.0 13.0 17.0
+2020-01-05 20.0 23.0 27.0
+
>>> df.shift(periods=1, axis="columns")
+ Col1 Col2 Col3
+2020-01-01 NaN 10 13
+2020-01-02 NaN 20 23
+2020-01-03 NaN 15 18
+2020-01-04 NaN 30 33
+2020-01-05 NaN 45 48
+
>>> df.shift(periods=3, fill_value=0)
+ Col1 Col2 Col3
+2020-01-01 0 0 0
+2020-01-02 0 0 0
+2020-01-03 0 0 0
+2020-01-04 10 13 17
+2020-01-05 20 23 27
+
>>> df.shift(periods=3, freq="D")
+ Col1 Col2 Col3
+2020-01-04 10 13 17
+2020-01-05 20 23 27
+2020-01-06 15 18 22
+2020-01-07 30 33 37
+2020-01-08 45 48 52
+
>>> df.shift(periods=3, freq="infer")
+ Col1 Col2 Col3
+2020-01-04 10 13 17
+2020-01-05 20 23 27
+2020-01-06 15 18 22
+2020-01-07 30 33 37
+2020-01-08 45 48 52
+
>>> df['Col1'].shift(periods=[0, 1, 2])
+ Col1_0 Col1_1 Col1_2
+2020-01-01 10 NaN NaN
+2020-01-02 20 10.0 NaN
+2020-01-03 15 20.0 10.0
+2020-01-04 30 15.0 20.0
+2020-01-05 45 30.0 15.0
+
set_index
(
keys
, drop=True
, append=False
, inplace=False
, verify_integrity=False
)
Set the DataFrame index using existing columns.
Set the DataFrame index (row labels) using one or more existing +columns or arrays (of the correct length). The index can replace the +existing index or expand on it.
+keys
+(label or array-like or list of labels/arrays)
+— This parameter can be either a single column key, a single array ofthe same length as the calling DataFrame, or a list containing an
+arbitrary combination of column keys and arrays. Here, "array"
+encompasses :class:Series
, :class:Index
, np.ndarray
, and
+instances of :class:~collections.abc.Iterator
.
+drop
+(bool, default True)
+— Delete columns to be used as the new index.append
+(bool, default False)
+— Whether to append columns to existing index.inplace
+(bool, default False)
+— Whether to modify the DataFrame rather than creating a new one.verify_integrity
+(bool, default False)
+— Check the new index for duplicates. Otherwise defer the check untilnecessary. Setting to False will improve the performance of this
+method.
+Changed row labels or None if inplace=True
.
DataFrame.reset_index : Opposite of set_index.
+DataFrame.reindex : Change to new indices or expand indices.
+DataFrame.reindex_like : Change to same indices as other DataFrame.
+>>> df = pd.DataFrame({'month': [1, 4, 7, 10],
+...                    'year': [2012, 2014, 2013, 2014],
+... 'sale': [55, 40, 84, 31]})
+>>> df
+ month year sale
+0 1 2012 55
+1 4 2014 40
+2 7 2013 84
+3 10 2014 31
+
Set the index to become the 'month' column:
+>>> df.set_index('month')
+ year sale
+month
+1 2012 55
+4 2014 40
+7 2013 84
+10 2014 31
+
Create a MultiIndex using columns 'year' and 'month':
+>>> df.set_index(['year', 'month'])
+ sale
+year month
+2012 1 55
+2014 4 40
+2013 7 84
+2014 10 31
+
Create a MultiIndex using an Index and a column:
+>>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
+ month sale
+ year
+1 2012 1 55
+2 2014 4 40
+3 2013 7 84
+4 2014 10 31
+
Create a MultiIndex using two Series:
+>>> s = pd.Series([1, 2, 3, 4])
+>>> df.set_index([s, s**2])
+ month year sale
+1 1 1 2012 55
+2 4 4 2014 40
+3 9 7 2013 84
+4 16 10 2014 31
+
reset_index
(
level=None
, drop=False
, inplace=False
, col_level=0
, col_fill=''
, allow_duplicates=<no_default>
, names=None
)
Reset the index, or a level of it.
Reset the index of the DataFrame, and use the default one instead. +If the DataFrame has a MultiIndex, this method can remove one or more +levels.
+level
+(int, str, tuple, or list, default None)
+— Only remove the given levels from the index. Removes all levels bydefault.
+drop
+(bool, default False)
+— Do not try to insert index into dataframe columns. This resetsthe index to the default integer index.
+inplace
+(bool, default False)
+— Whether to modify the DataFrame rather than creating a new one.col_level
+(int or str, default 0)
+— If the columns have multiple levels, determines which level thelabels are inserted into. By default it is inserted into the first
+level.
+col_fill
+(object, default '')
+— If the columns have multiple levels, determines how the otherlevels are named. If None then the index name is repeated.
+allow_duplicates
+(bool, optional, default lib.no_default)
+— Allow duplicate column labels to be created.names
+(int, str or 1-dimensional list, default None)
+— Using the given string, rename the DataFrame column which contains theindex data. If the DataFrame has a MultiIndex, this has to be a list or
+tuple with length equal to the number of levels.
+DataFrame with the new index or None if inplace=True
.
DataFrame.set_index : Opposite of reset_index.
+DataFrame.reindex : Change to new indices or expand indices.
+DataFrame.reindex_like : Change to same indices as other DataFrame.
+>>> df = pd.DataFrame([('bird', 389.0),
+...                    ('bird', 24.0),
+... ('mammal', 80.5),
+... ('mammal', np.nan)],
+... index=['falcon', 'parrot', 'lion', 'monkey'],
+... columns=('class', 'max_speed'))
+>>> df
+ class max_speed
+falcon bird 389.0
+parrot bird 24.0
+lion mammal 80.5
+monkey mammal NaN
+
When we reset the index, the old index is added as a column, and a +new sequential index is used:
+>>> df.reset_index()
+ index class max_speed
+0 falcon bird 389.0
+1 parrot bird 24.0
+2 lion mammal 80.5
+3 monkey mammal NaN
+
We can use the drop
parameter to avoid the old index being added as
+a column:
>>> df.reset_index(drop=True)
+ class max_speed
+0 bird 389.0
+1 bird 24.0
+2 mammal 80.5
+3 mammal NaN
+
You can also use reset_index
with MultiIndex
.
>>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
+... ('bird', 'parrot'),
+... ('mammal', 'lion'),
+... ('mammal', 'monkey')],
+... names=['class', 'name'])
+>>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
+... ('species', 'type')])
+>>> df = pd.DataFrame([(389.0, 'fly'),
+... (24.0, 'fly'),
+... (80.5, 'run'),
+... (np.nan, 'jump')],
+... index=index,
+... columns=columns)
+>>> df
+ speed species
+ max type
+class name
+bird falcon 389.0 fly
+ parrot 24.0 fly
+mammal lion 80.5 run
+ monkey NaN jump
+
Using the names
parameter, choose a name for the index column:
>>> df.reset_index(names=['classes', 'names'])
+ classes names speed species
+ max type
+0 bird falcon 389.0 fly
+1 bird parrot 24.0 fly
+2 mammal lion 80.5 run
+3 mammal monkey NaN jump
+
If the index has multiple levels, we can reset a subset of them:
+>>> df.reset_index(level='class')
+ class speed species
+ max type
+name
+falcon bird 389.0 fly
+parrot bird 24.0 fly
+lion mammal 80.5 run
+monkey mammal NaN jump
+
If we are not dropping the index, by default, it is placed in the top +level. We can place it in another level:
+>>> df.reset_index(level='class', col_level=1)
+ speed species
+ class max type
+name
+falcon bird 389.0 fly
+parrot bird 24.0 fly
+lion mammal 80.5 run
+monkey mammal NaN jump
+
When the index is inserted under another level, we can specify under
+which one with the parameter col_fill
:
>>> df.reset_index(level='class', col_level=1, col_fill='species')
+ species speed species
+ class max type
+name
+falcon bird 389.0 fly
+parrot bird 24.0 fly
+lion mammal 80.5 run
+monkey mammal NaN jump
+
If we specify a nonexistent level for col_fill
, it is created:
>>> df.reset_index(level='class', col_level=1, col_fill='genus')
+ genus speed species
+ class max type
+name
+falcon bird 389.0 fly
+parrot bird 24.0 fly
+lion mammal 80.5 run
+monkey mammal NaN jump
+
isna
(
)
Detect missing values.
Return a boolean same-sized object indicating if the values are NA.
+NA values, such as None or :attr:numpy.NaN
, gets mapped to True
+values.
+Everything else gets mapped to False values. Characters such as empty
+strings ''
or :attr:numpy.inf
are not considered NA values
+(unless you set pandas.options.mode.use_inf_as_na = True
).
Mask of bool values for each element in DataFrame that
+indicates whether an element is an NA value.
+DataFrame.isnull : Alias of isna.
+DataFrame.notna : Boolean inverse of isna.
+DataFrame.dropna : Omit axes labels with missing values.
+isna : Top-level isna.
+Show which entries in a DataFrame are NA.
>>> df = pd.DataFrame(dict(age=[5, 6, np.nan],
+... born=[pd.NaT, pd.Timestamp('1939-05-27'),
+... pd.Timestamp('1940-04-25')],
+... name=['Alfred', 'Batman', ''],
+... toy=[None, 'Batmobile', 'Joker']))
+>>> df
+ age born name toy
+0 5.0 NaT Alfred None
+1 6.0 1939-05-27 Batman Batmobile
+2 NaN 1940-04-25 Joker
+
>>> df.isna()
+ age born name toy
+0 False True False True
+1 False False False False
+2 True False False False
+
Show which entries in a Series are NA.
+>>> ser = pd.Series([5, 6, np.nan])
+>>> ser
+0 5.0
+1 6.0
+2 NaN
+dtype: float64
+
>>> ser.isna()
+0 False
+1 False
+2 True
+dtype: bool
+
isnull
(
)
DataFrame.isnull is an alias for DataFrame.isna.
Detect missing values.
+Return a boolean same-sized object indicating if the values are NA.
+NA values, such as None or :attr:numpy.NaN
, gets mapped to True
+values.
+Everything else gets mapped to False values. Characters such as empty
+strings ''
or :attr:numpy.inf
are not considered NA values
+(unless you set pandas.options.mode.use_inf_as_na = True
).
Mask of bool values for each element in DataFrame thatindicates whether an element is an NA value.
+DataFrame.isnull : Alias of isna.DataFrame.notna : Boolean inverse of isna. +DataFrame.dropna : Omit axes labels with missing values. +isna : Top-level isna.
+Show which entries in a DataFrame are NA.
>>> df = pd.DataFrame(dict(age=[5, 6, np.nan],
+... born=[pd.NaT, pd.Timestamp('1939-05-27'),
+... pd.Timestamp('1940-04-25')],
+... name=['Alfred', 'Batman', ''],
+... toy=[None, 'Batmobile', 'Joker']))
+>>> df
+ age born name toy
+0 5.0 NaT Alfred None
+1 6.0 1939-05-27 Batman Batmobile
+2 NaN 1940-04-25 Joker
+
>>> df.isna()
+ age born name toy
+0 False True False True
+1 False False False False
+2 True False False False
+
Show which entries in a Series are NA.
+>>> ser = pd.Series([5, 6, np.nan])
+>>> ser
+0 5.0
+1 6.0
+2 NaN
+dtype: float64
+
>>> ser.isna()
+0 False
+1 False
+2 True
+dtype: bool
+
notna
(
)
Detect existing (non-missing) values.
Return a boolean same-sized object indicating if the values are not NA.
+Non-missing values get mapped to True. Characters such as empty
+strings ''
or :attr:numpy.inf
are not considered NA values
+(unless you set pandas.options.mode.use_inf_as_na = True
).
+NA values, such as None or :attr:numpy.NaN
, get mapped to False
+values.
Mask of bool values for each element in DataFrame that indicates whether an element is not an NA value.
DataFrame.notnull : Alias of notna.
DataFrame.isna : Boolean inverse of notna.
DataFrame.dropna : Omit axes labels with missing values.
notna : Top-level notna.
+Show which entries in a DataFrame are not NA.
>>> df = pd.DataFrame(dict(age=[5, 6, np.nan],
+... born=[pd.NaT, pd.Timestamp('1939-05-27'),
+... pd.Timestamp('1940-04-25')],
+... name=['Alfred', 'Batman', ''],
+... toy=[None, 'Batmobile', 'Joker']))
+>>> df
+ age born name toy
+0 5.0 NaT Alfred None
+1 6.0 1939-05-27 Batman Batmobile
+2 NaN 1940-04-25 Joker
+
>>> df.notna()
+ age born name toy
+0 True False True False
+1 True True True True
+2 False True True True
+
Show which entries in a Series are not NA.
+>>> ser = pd.Series([5, 6, np.nan])
+>>> ser
+0 5.0
+1 6.0
+2 NaN
+dtype: float64
+
>>> ser.notna()
+0 True
+1 True
+2 False
+dtype: bool
+
notnull
(
)
DataFrame.notnull is an alias for DataFrame.notna.
Detect existing (non-missing) values.
+Return a boolean same-sized object indicating if the values are not NA.
+Non-missing values get mapped to True. Characters such as empty
+strings ''
or :attr:numpy.inf
are not considered NA values
+(unless you set pandas.options.mode.use_inf_as_na = True
).
+NA values, such as None or :attr:numpy.NaN
, get mapped to False
+values.
Mask of bool values for each element in DataFrame that indicates whether an element is not an NA value.
DataFrame.notnull : Alias of notna.
DataFrame.isna : Boolean inverse of notna.
DataFrame.dropna : Omit axes labels with missing values.
notna : Top-level notna.
+Show which entries in a DataFrame are not NA.
>>> df = pd.DataFrame(dict(age=[5, 6, np.nan],
+... born=[pd.NaT, pd.Timestamp('1939-05-27'),
+... pd.Timestamp('1940-04-25')],
+... name=['Alfred', 'Batman', ''],
+... toy=[None, 'Batmobile', 'Joker']))
+>>> df
+ age born name toy
+0 5.0 NaT Alfred None
+1 6.0 1939-05-27 Batman Batmobile
+2 NaN 1940-04-25 Joker
+
>>> df.notna()
+ age born name toy
+0 True False True False
+1 True True True True
+2 False True True True
+
Show which entries in a Series are not NA.
+>>> ser = pd.Series([5, 6, np.nan])
+>>> ser
+0 5.0
+1 6.0
+2 NaN
+dtype: float64
+
>>> ser.notna()
+0 True
+1 True
+2 False
+dtype: bool
+
dropna
(
axis=0
, how=<no_default>
, thresh=<no_default>
, subset=None
, inplace=False
, ignore_index=False
)
Remove missing values.
See the :ref:User Guide <missing_data>
for more on which values are
+considered missing, and how to work with missing data.
axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— Determine if rows or columns which contain missing values are removed.
+how
+({'any', 'all'}, default 'any')
+— Determine if row or column is removed from DataFrame, when we have at least one NA or all NA.
+thresh
+(int, optional)
+— Require that many non-NA values. Cannot be combined with how.
+subset
+(column label or sequence of labels, optional)
+— Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include.
+inplace
+(bool, default False)
+— Whether to modify the DataFrame rather than creating a new one.
+ignore_index
+(bool, default ``False``)
+— If True, the resulting axis will be labeled 0, 1, …, n - 1.
DataFrame with NA entries dropped from it or None if inplace=True.
DataFrame.isna : Indicate missing values.
DataFrame.notna : Indicate existing (non-missing) values.
DataFrame.fillna : Replace missing values.
Series.dropna : Drop missing values.
Index.dropna : Drop missing indices.
+>>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
+... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
+... pd.NaT]})
+>>> df
+ name toy born
+0 Alfred NaN NaT
+1 Batman Batmobile 1940-04-25
+2 Catwoman Bullwhip NaT
+
Drop the rows where at least one element is missing.
+>>> df.dropna()
+ name toy born
+1 Batman Batmobile 1940-04-25
+
Drop the columns where at least one element is missing.
+>>> df.dropna(axis='columns')
+ name
+0 Alfred
+1 Batman
+2 Catwoman
+
Drop the rows where all elements are missing.
+>>> df.dropna(how='all')
+ name toy born
+0 Alfred NaN NaT
+1 Batman Batmobile 1940-04-25
+2 Catwoman Bullwhip NaT
+
Keep only the rows with at least 2 non-NA values.
+>>> df.dropna(thresh=2)
+ name toy born
+1 Batman Batmobile 1940-04-25
+2 Catwoman Bullwhip NaT
+
Define in which columns to look for missing values.
+>>> df.dropna(subset=['name', 'toy'])
+ name toy born
+1 Batman Batmobile 1940-04-25
+2 Catwoman Bullwhip NaT
+
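ignore_index can be combined with the options above to relabel the surviving rows; a small sketch reusing the same df, where the row numbering restarts at 0:
+>>> df.dropna(subset=['toy'], ignore_index=True)
+       name        toy       born
+0    Batman  Batmobile 1940-04-25
+1  Catwoman   Bullwhip        NaT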
drop_duplicates
(
subset=None
, keep='first'
, inplace=False
, ignore_index=False
)
Return DataFrame with duplicate rows removed.
Considering certain columns is optional. Indexes, including time indexes +are ignored.
+subset
+(column label or sequence of labels, optional)
+— Only consider certain columns for identifying duplicates, by default use all of the columns.
+keep
+({'first', 'last', ``False``}, default 'first')
+— Determines which duplicates (if any) to keep. 'first' : Drop duplicates except for the first occurrence. 'last' : Drop duplicates except for the last occurrence. False : Drop all duplicates.
+inplace
+(bool, default ``False``)
+— Whether to modify the DataFrame rather than creating a new one.
+ignore_index
+(bool, default ``False``)
+— If True, the resulting axis will be labeled 0, 1, …, n - 1.
DataFrame with duplicates removed or None if inplace=True.
DataFrame.value_counts: Count unique combinations of columns.
Consider dataset containing ramen rating.
>>> df = pd.DataFrame({
+... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
+... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
+... 'rating': [4, 4, 3.5, 15, 5]
+... })
+>>> df
+ brand style rating
+0 Yum Yum cup 4.0
+1 Yum Yum cup 4.0
+2 Indomie cup 3.5
+3 Indomie pack 15.0
+4 Indomie pack 5.0
+
By default, it removes duplicate rows based on all columns.
+>>> df.drop_duplicates()
+ brand style rating
+0 Yum Yum cup 4.0
+2 Indomie cup 3.5
+3 Indomie pack 15.0
+4 Indomie pack 5.0
+
To remove duplicates on specific column(s), use subset
.
>>> df.drop_duplicates(subset=['brand'])
+ brand style rating
+0 Yum Yum cup 4.0
+2 Indomie cup 3.5
+
To remove duplicates and keep last occurrences, use keep
.
>>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
+ brand style rating
+1 Yum Yum cup 4.0
+2 Indomie cup 3.5
+4 Indomie pack 5.0
+
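Setting keep to False drops every row that has a duplicate rather than keeping one representative; a minimal sketch with the same ramen df:
+>>> df.drop_duplicates(keep=False)
+     brand style  rating
+2  Indomie   cup     3.5
+3  Indomie  pack    15.0
+4  Indomie  pack     5.0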
duplicated
(
subset=None
, keep='first'
)
Return boolean Series denoting duplicate rows.
Considering certain columns is optional.
+subset
+(column label or sequence of labels, optional)
+— Only consider certain columns for identifying duplicates, by default use all of the columns.
+keep
+({'first', 'last', False}, default 'first')
+— Determines which duplicates (if any) to mark. first : Mark duplicates as True except for the first occurrence. last : Mark duplicates as True except for the last occurrence. False : Mark all duplicates as True.
Boolean Series for each duplicated row.
Index.duplicated : Equivalent method on index.
Series.duplicated : Equivalent method on Series.
Series.drop_duplicates : Remove duplicate values from Series.
DataFrame.drop_duplicates : Remove duplicate values from DataFrame.
+Consider dataset containing ramen rating.
>>> df = pd.DataFrame({
+... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
+... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
+... 'rating': [4, 4, 3.5, 15, 5]
+... })
+>>> df
+ brand style rating
+0 Yum Yum cup 4.0
+1 Yum Yum cup 4.0
+2 Indomie cup 3.5
+3 Indomie pack 15.0
+4 Indomie pack 5.0
+
By default, for each set of duplicated values, the first occurrence is set to False and all others to True.
+>>> df.duplicated()
+0 False
+1 True
+2 False
+3 False
+4 False
+dtype: bool
+
By using 'last', the last occurrence of each set of duplicated values is set to False and all others to True.
+>>> df.duplicated(keep='last')
+0 True
+1 False
+2 False
+3 False
+4 False
+dtype: bool
+
By setting keep
to False, all duplicates are True.
>>> df.duplicated(keep=False)
+0 True
+1 True
+2 False
+3 False
+4 False
+dtype: bool
+
To find duplicates on specific column(s), use subset
.
>>> df.duplicated(subset=['brand'])
+0 False
+1 True
+2 False
+3 True
+4 True
+dtype: bool
+
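Because the result is a boolean Series aligned with the rows, it can be inverted and used for indexing, which is equivalent to drop_duplicates; a minimal sketch with the same df:
+>>> df[~df.duplicated(subset=['brand'])]
+     brand style  rating
+0  Yum Yum   cup     4.0
+2  Indomie   cup     3.5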
sort_values
(
by
, axis=0
, ascending=True
, inplace=False
, kind='quicksort'
, na_position='last'
, ignore_index=False
, key=None
)
Sort by the values along either axis.
by
+(str or list of str)
+— Name or list of names to sort by. If axis is 0 or 'index' then by may contain index levels and/or column labels. If axis is 1 or 'columns' then by may contain column levels and/or index labels.
+axis
+("{0 or 'index', 1 or 'columns'}", default 0)
+— Axis to be sorted.
+ascending
+(bool or list of bool, default True)
+— Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bools, must match the length of the by.
+inplace
+(bool, default False)
+— If True, perform operation in-place.
+kind
+({'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort')
+— Choice of sorting algorithm. See also numpy.sort for more information. mergesort and stable are the only stable algorithms. For DataFrames, this option is only applied when sorting on a single column or label.
+na_position
+({'first', 'last'}, default 'last')
+— Puts NaNs at the beginning if first; last puts NaNs at the end.
+ignore_index
+(bool, default False)
+— If True, the resulting axis will be labeled 0, 1, …, n - 1.
+key
+(callable, optional)
+— Apply the key function to the values before sorting. This is similar to the key argument in the builtin sorted function, with the notable difference that this key function should be vectorized. It should expect a Series and return a Series with the same shape as the input. It will be applied to each column in by independently.
+DataFrame with sorted values or None if inplace=True.
DataFrame.sort_index : Sort a DataFrame by the index.
Series.sort_values : Similar method for a Series.
>>> df = pd.DataFrame({
+... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
+... 'col2': [2, 1, 9, 8, 7, 4],
+... 'col3': [0, 1, 9, 4, 2, 3],
+... 'col4': ['a', 'B', 'c', 'D', 'e', 'F']
+... })
+>>> df
+ col1 col2 col3 col4
+0 A 2 0 a
+1 A 1 1 B
+2 B 9 9 c
+3 NaN 8 4 D
+4 D 7 2 e
+5 C 4 3 F
+
Sort by col1
+>>> df.sort_values(by=['col1'])
+ col1 col2 col3 col4
+0 A 2 0 a
+1 A 1 1 B
+2 B 9 9 c
+5 C 4 3 F
+4 D 7 2 e
+3 NaN 8 4 D
+
Sort by multiple columns
+>>> df.sort_values(by=['col1', 'col2'])
+ col1 col2 col3 col4
+1 A 1 1 B
+0 A 2 0 a
+2 B 9 9 c
+5 C 4 3 F
+4 D 7 2 e
+3 NaN 8 4 D
+
Sort Descending
+>>> df.sort_values(by='col1', ascending=False)
+ col1 col2 col3 col4
+4 D 7 2 e
+5 C 4 3 F
+2 B 9 9 c
+0 A 2 0 a
+1 A 1 1 B
+3 NaN 8 4 D
+
Putting NAs first
+>>> df.sort_values(by='col1', ascending=False, na_position='first')
+ col1 col2 col3 col4
+3 NaN 8 4 D
+4 D 7 2 e
+5 C 4 3 F
+2 B 9 9 c
+0 A 2 0 a
+1 A 1 1 B
+
Sorting with a key function
+>>> df.sort_values(by='col4', key=lambda col: col.str.lower())
+ col1 col2 col3 col4
+0 A 2 0 a
+1 A 1 1 B
+2 B 9 9 c
+3 NaN 8 4 D
+4 D 7 2 e
+5 C 4 3 F
+
Natural sort with the key argument, using the natsort package (https://github.com/SethMMorton/natsort).
>>> df = pd.DataFrame({
+... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'],
+... "value": [10, 20, 30, 40, 50]
+... })
+>>> df
+ time value
+0 0hr 10
+1 128hr 20
+2 72hr 30
+3 48hr 40
+4 96hr 50
+>>> from natsort import index_natsorted
+>>> df.sort_values(
+... by="time",
+... key=lambda x: np.argsort(index_natsorted(df["time"]))
+... )
+ time value
+0 0hr 10
+3 48hr 40
+2 72hr 30
+4 96hr 50
+1 128hr 20
+
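ascending also accepts a list with one flag per sort key, so different columns can be sorted in different directions; a minimal sketch re-creating the col1/col2 frame from above:
+>>> df = pd.DataFrame({
+... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
+... 'col2': [2, 1, 9, 8, 7, 4],
+... 'col3': [0, 1, 9, 4, 2, 3],
+... 'col4': ['a', 'B', 'c', 'D', 'e', 'F']
+... })
+>>> df.sort_values(by=['col1', 'col2'], ascending=[True, False])
+  col1  col2  col3 col4
+0    A     2     0    a
+1    A     1     1    B
+2    B     9     9    c
+5    C     4     3    F
+4    D     7     2    e
+3  NaN     8     4    D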
sort_index
(
axis=0
, level=None
, ascending=True
, inplace=False
, kind='quicksort'
, na_position='last'
, sort_remaining=True
, ignore_index=False
, key=None
)
Sort object by labels (along an axis).
Returns a new DataFrame sorted by label if inplace
argument is
+False
, otherwise updates the original DataFrame and returns None.
axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis along which to sort. The value 0 identifies the rows,and 1 identifies the columns.
+level
+(int or level name or list of ints or list of level names)
+— If not None, sort on values in specified index level(s).ascending
+(bool or list-like of bools, default True)
+— Sort ascending vs. descending. When the index is a MultiIndex thesort direction can be controlled for each level individually.
+inplace
+(bool, default False)
+— Whether to modify the DataFrame rather than creating a new one.kind
+({'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort')
+— Choice of sorting algorithm. See also numpy.sort for more information. mergesort and stable are the only stable algorithms. For DataFrames, this option is only applied when sorting on a single column or label.
+na_position
+({'first', 'last'}, default 'last')
+— Puts NaNs at the beginning if first; last puts NaNs at the end. Not implemented for MultiIndex.
+sort_remaining
+(bool, default True)
+— If True and sorting by level and index is multilevel, sort by other levels too (in order) after sorting by specified level.
+ignore_index
+(bool, default False)
+— If True, the resulting axis will be labeled 0, 1, …, n - 1.
+key
+(callable, optional)
+— If not None, apply the key function to the index values before sorting. This is similar to the key argument in the builtin sorted function, with the notable difference that this key function should be vectorized. It should expect an Index and return an Index of the same shape. For MultiIndex inputs, the key is applied per level.
+The original DataFrame sorted by the labels or None if inplace=True.
Series.sort_index : Sort Series by the index.
DataFrame.sort_values : Sort DataFrame by the value.
Series.sort_values : Sort Series by the value.
>>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150],
+... columns=['A'])
+>>> df.sort_index()
+ A
+1 4
+29 2
+100 1
+150 5
+234 3
+
By default, it sorts in ascending order, to sort in descending order,
+use ascending=False
>>> df.sort_index(ascending=False)
+ A
+234 3
+150 5
+100 1
+29 2
+1 4
+
A key function can be specified which is applied to the index before
+sorting. For a MultiIndex
this is applied to each level separately.
>>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
+>>> df.sort_index(key=lambda x: x.str.lower())
+ a
+A 1
+b 2
+C 3
+d 4
+
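Passing axis=1 sorts the column labels instead of the row index; a tiny sketch with a throwaway two-column frame (the column names here are only illustrative):
+>>> df = pd.DataFrame({'b': [1, 2], 'a': [3, 4]})
+>>> df.sort_index(axis=1)
+   a  b
+0  3  1
+1  4  2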
value_counts
(
subset=None
, normalize=False
, sort=True
, ascending=False
, dropna=True
)
Return a Series containing the frequency of each distinct row in the Dataframe.
subset
+(label or list of labels, optional)
+— Columns to use when counting unique combinations.
+normalize
+(bool, default False)
+— Return proportions rather than frequencies.
+sort
+(bool, default True)
+— Sort by frequencies when True. Sort by DataFrame column values when False.
+ascending
+(bool, default False)
+— Sort in ascending order.
+dropna
+(bool, default True)
+— Don't include counts of rows that contain NA values.
Series.value_counts : Equivalent method on Series.
Notes
+The returned Series will have a MultiIndex with one level per input +column but an Index (non-multi) for a single label. By default, rows +that contain any NA values are omitted from the result. By default, +the resulting Series will be in descending order so that the first +element is the most frequently-occurring row.
>>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
+... 'num_wings': [2, 0, 0, 0]},
+... index=['falcon', 'dog', 'cat', 'ant'])
+>>> df
+ num_legs num_wings
+falcon 2 2
+dog 4 0
+cat 4 0
+ant 6 0
+
>>> df.value_counts()
+num_legs num_wings
+4 0 2
+2 2 1
+6 0 1
+Name: count, dtype: int64
+
>>> df.value_counts(sort=False)
+num_legs num_wings
+2 2 1
+4 0 2
+6 0 1
+Name: count, dtype: int64
+
>>> df.value_counts(ascending=True)
+num_legs num_wings
+2 2 1
+6 0 1
+4 0 2
+Name: count, dtype: int64
+
>>> df.value_counts(normalize=True)
+num_legs num_wings
+4 0 0.50
+2 2 0.25
+6 0 0.25
+Name: proportion, dtype: float64
+
With dropna
set to False
we can also count rows with NA values.
>>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
+... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
+>>> df
+ first_name middle_name
+0 John Smith
+1 Anne <NA>
+2 John <NA>
+3 Beth Louise
+
>>> df.value_counts()
+first_name middle_name
+Beth Louise 1
+John Smith 1
+Name: count, dtype: int64
+
>>> df.value_counts(dropna=False)
+first_name middle_name
+Anne NaN 1
+Beth Louise 1
+John Smith 1
+ NaN 1
+Name: count, dtype: int64
+
>>> df.value_counts("first_name")
+first_name
+John 2
+Anne 1
+Beth 1
+Name: count, dtype: int64
+
nlargest
(
n
, columns
, keep='first'
)
Return the first n
rows ordered by columns
in descending order.
Return the first n
rows with the largest values in columns
, in
+descending order. The columns that are not specified are returned as
+well, but not used for ordering.
This method is equivalent to
+df.sort_values(columns, ascending=False).head(n)
, but more
+performant.
n
+(int)
+— Number of rows to return.columns
+(label or list of labels)
+— Column label(s) to order by.keep
+({'first', 'last', 'all'}, default 'first')
+— Where there are duplicate values: first : prioritize the first occurrence(s); last : prioritize the last occurrence(s); all : keep all the ties of the smallest item even if it means selecting more than n items.
The first n rows ordered by the given columns in descending order.
DataFrame.nsmallest : Return the first n
rows ordered by columns
in ascending order.
+DataFrame.sort_values : Sort DataFrame by the values.
+DataFrame.head : Return the first n
rows without re-ordering.
Notes
+This function cannot be used with all column types. For example, when
+specifying columns with object
or category
dtypes, TypeError
is
+raised.
>>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
+... 434000, 434000, 337000, 11300,
+... 11300, 11300],
+... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
+... 17036, 182, 38, 311],
+... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
+... "IS", "NR", "TV", "AI"]},
+... index=["Italy", "France", "Malta",
+... "Maldives", "Brunei", "Iceland",
+... "Nauru", "Tuvalu", "Anguilla"])
+>>> df
+ population GDP alpha-2
+Italy 59000000 1937894 IT
+France 65000000 2583560 FR
+Malta 434000 12011 MT
+Maldives 434000 4520 MV
+Brunei 434000 12128 BN
+Iceland 337000 17036 IS
+Nauru 11300 182 NR
+Tuvalu 11300 38 TV
+Anguilla 11300 311 AI
+
In the following example, we will use nlargest
to select the three
+rows having the largest values in column "population".
>>> df.nlargest(3, 'population')
+ population GDP alpha-2
+France 65000000 2583560 FR
+Italy 59000000 1937894 IT
+Malta 434000 12011 MT
+
When using keep='last'
, ties are resolved in reverse order:
>>> df.nlargest(3, 'population', keep='last')
+ population GDP alpha-2
+France 65000000 2583560 FR
+Italy 59000000 1937894 IT
+Brunei 434000 12128 BN
+
When using keep='all'
, the number of elements kept can go beyond n
+if there are duplicate values for the smallest element, all the
+ties are kept:
>>> df.nlargest(3, 'population', keep='all')
+ population GDP alpha-2
+France 65000000 2583560 FR
+Italy 59000000 1937894 IT
+Malta 434000 12011 MT
+Maldives 434000 4520 MV
+Brunei 434000 12128 BN
+
However, nlargest
does not keep n
distinct largest elements:
>>> df.nlargest(5, 'population', keep='all')
+ population GDP alpha-2
+France 65000000 2583560 FR
+Italy 59000000 1937894 IT
+Malta 434000 12011 MT
+Maldives 434000 4520 MV
+Brunei 434000 12128 BN
+
To order by the largest values in column "population" and then "GDP", +we can specify multiple columns like in the next example.
+>>> df.nlargest(3, ['population', 'GDP'])
+ population GDP alpha-2
+France 65000000 2583560 FR
+Italy 59000000 1937894 IT
+Brunei 434000 12128 BN
+
nsmallest
(
n
, columns
, keep='first'
)
Return the first n
rows ordered by columns
in ascending order.
Return the first n
rows with the smallest values in columns
, in
+ascending order. The columns that are not specified are returned as
+well, but not used for ordering.
This method is equivalent to
+df.sort_values(columns, ascending=True).head(n)
, but more
+performant.
n
+(int)
+— Number of items to retrieve.columns
+(list or str)
+— Column name or names to order by.keep
+({'first', 'last', 'all'}, default 'first')
+— Where there are duplicate values: first : take the first occurrence; last : take the last occurrence; all : keep all the ties of the largest item even if it means selecting more than n items.
DataFrame.nlargest : Return the first n
rows ordered by columns
in descending order.
+DataFrame.sort_values : Sort DataFrame by the values.
+DataFrame.head : Return the first n
rows without re-ordering.
>>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
+... 434000, 434000, 337000, 337000,
+... 11300, 11300],
+... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
+... 17036, 182, 38, 311],
+... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
+... "IS", "NR", "TV", "AI"]},
+... index=["Italy", "France", "Malta",
+... "Maldives", "Brunei", "Iceland",
+... "Nauru", "Tuvalu", "Anguilla"])
+>>> df
+ population GDP alpha-2
+Italy 59000000 1937894 IT
+France 65000000 2583560 FR
+Malta 434000 12011 MT
+Maldives 434000 4520 MV
+Brunei 434000 12128 BN
+Iceland 337000 17036 IS
+Nauru 337000 182 NR
+Tuvalu 11300 38 TV
+Anguilla 11300 311 AI
+
In the following example, we will use nsmallest
to select the
+three rows having the smallest values in column "population".
>>> df.nsmallest(3, 'population')
+ population GDP alpha-2
+Tuvalu 11300 38 TV
+Anguilla 11300 311 AI
+Iceland 337000 17036 IS
+
When using keep='last'
, ties are resolved in reverse order:
>>> df.nsmallest(3, 'population', keep='last')
+ population GDP alpha-2
+Anguilla 11300 311 AI
+Tuvalu 11300 38 TV
+Nauru 337000 182 NR
+
When using keep='all'
, the number of elements kept can go beyond n
+if there are duplicate values for the largest element, all the
+ties are kept.
>>> df.nsmallest(3, 'population', keep='all')
+ population GDP alpha-2
+Tuvalu 11300 38 TV
+Anguilla 11300 311 AI
+Iceland 337000 17036 IS
+Nauru 337000 182 NR
+
However, nsmallest
does not keep n
distinct
+smallest elements:
>>> df.nsmallest(4, 'population', keep='all')
+ population GDP alpha-2
+Tuvalu 11300 38 TV
+Anguilla 11300 311 AI
+Iceland 337000 17036 IS
+Nauru 337000 182 NR
+
To order by the smallest values in column "population" and then "GDP", we can +specify multiple columns like in the next example.
+>>> df.nsmallest(3, ['population', 'GDP'])
+ population GDP alpha-2
+Tuvalu 11300 38 TV
+Anguilla 11300 311 AI
+Nauru 337000 182 NR
+
swaplevel
(
i=-2
, j=-1
, axis=0
)
Swap levels i and j in a :class:MultiIndex
.
Default is to swap the two innermost levels of the index.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis to swap levels on. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+DataFrame with levels swapped in MultiIndex.
>>> df = pd.DataFrame(
+... {"Grade": ["A", "B", "A", "C"]},
+... index=[
+... ["Final exam", "Final exam", "Coursework", "Coursework"],
+... ["History", "Geography", "History", "Geography"],
+... ["January", "February", "March", "April"],
+... ],
+... )
+>>> df
+ Grade
+Final exam History January A
+ Geography February B
+Coursework History March A
+ Geography April C
+
In the following example, we will swap the levels of the indices. Here, we swap the levels of the row index, but levels can be swapped column-wise (axis=1) in a similar manner; row-wise is the default behaviour. By not supplying any arguments for i and j, we swap the last and second to last levels.
+>>> df.swaplevel()
+ Grade
+Final exam January History A
+ February Geography B
+Coursework March History A
+ April Geography C
+
By supplying one argument, we can choose which index to swap the last +index with. We can for example swap the first index with the last one as +follows.
+>>> df.swaplevel(0)
+ Grade
+January History Final exam A
+February Geography Final exam B
+March History Coursework A
+April Geography Coursework C
+
We can also define explicitly which indices we want to swap by supplying values +for both i and j. Here, we for example swap the first and second indices.
+>>> df.swaplevel(0, 1)
+ Grade
+History Final exam January A
+Geography Final exam February B
+History Coursework March A
+Geography Coursework April C
+
reorder_levels
(
order
, axis=0
)
Rearrange index levels using input order. May not drop or duplicate levels.
order
+(list of int or list of str)
+— List representing new level order. Reference level by number(position) or by key (label).
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— Where to reorder levels.
>>> data = {
+... "class": ["Mammals", "Mammals", "Reptiles"],
+... "diet": ["Omnivore", "Carnivore", "Carnivore"],
+... "species": ["Humans", "Dogs", "Snakes"],
+... }
+>>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
+>>> df = df.set_index(["class", "diet"])
+>>> df
+ species
+class diet
+Mammals Omnivore Humans
+ Carnivore Dogs
+Reptiles Carnivore Snakes
+
Let's reorder the levels of the index:
+>>> df.reorder_levels(["diet", "class"])
+ species
+diet class
+Omnivore Mammals Humans
+Carnivore Mammals Dogs
+ Reptiles Snakes
+
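Levels can equally be referenced by position; a minimal sketch using the same df, where [1, 0] is equivalent to ["diet", "class"]:
+>>> df.reorder_levels([1, 0])
+                     species
+diet       class
+Omnivore   Mammals   Humans
+Carnivore  Mammals     Dogs
+           Reptiles  Snakes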
eq
(
other
, axis='columns'
, level=None
)
Get Equal to of dataframe and other, element-wise (binary operator eq
).
Among flexible wrappers (eq
, ne
, le
, lt
, ge
, gt
) to comparison
+operators.
Equivalent to ==
, !=
, <=
, <
, >=
, >
with support to choose axis
+(rows or columns) and level for comparison.
other
+(scalar, sequence, Series, or DataFrame)
+— Any single or multiple element data structure, or list-like object.axis
+({0 or 'index', 1 or 'columns'}, default 'columns')
+— Whether to compare by the index (0 or 'index') or columns(1 or 'columns').
+level
+(int or label)
+— Broadcast across a level, matching Index values on the passedMultiIndex level.
+Result of the comparison.
DataFrame.eq : Compare DataFrames for equality elementwise.
DataFrame.ne : Compare DataFrames for inequality elementwise.
DataFrame.le : Compare DataFrames for less than inequality or equality elementwise.
DataFrame.lt : Compare DataFrames for strictly less than inequality elementwise.
DataFrame.ge : Compare DataFrames for greater than inequality or equality elementwise.
DataFrame.gt : Compare DataFrames for strictly greater than inequality elementwise.
+Notes
+Mismatched indices will be unioned together.
+NaN values are considered different (i.e. NaN != NaN).
>>> df = pd.DataFrame({'cost': [250, 150, 100],
+... 'revenue': [100, 250, 300]},
+... index=['A', 'B', 'C'])
+>>> df
+ cost revenue
+A 250 100
+B 150 250
+C 100 300
+
Comparison with a scalar, using either the operator or method:
+>>> df == 100
+ cost revenue
+A False True
+B False False
+C True False
+
>>> df.eq(100)
+ cost revenue
+A False True
+B False False
+C True False
+
When other
is a :class:Series
, the columns of a DataFrame are aligned
+with the index of other
and broadcast:
>>> df != pd.Series([100, 250], index=["cost", "revenue"])
+ cost revenue
+A True True
+B True False
+C False True
+
Use the method to control the broadcast axis:
+>>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index')
+ cost revenue
+A True False
+B True True
+C True True
+D True True
+
When comparing to an arbitrary sequence, the number of columns must
+match the number of elements in other
:
>>> df == [250, 100]
+ cost revenue
+A True True
+B False False
+C False False
+
Use the method to control the axis:
+>>> df.eq([250, 250, 100], axis='index')
+ cost revenue
+A True False
+B False True
+C True False
+
Compare to a DataFrame of different shape.
+>>> other = pd.DataFrame({'revenue': [300, 250, 100, 150]},
+... index=['A', 'B', 'C', 'D'])
+>>> other
+ revenue
+A 300
+B 250
+C 100
+D 150
+
>>> df.gt(other)
+ cost revenue
+A False False
+B False False
+C False True
+D False False
+
Compare to a MultiIndex by level.
+>>> df_multindex = pd.DataFrame({'cost': [250, 150, 100, 150, 300, 220],
+... 'revenue': [100, 250, 300, 200, 175, 225]},
+... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
+... ['A', 'B', 'C', 'A', 'B', 'C']])
+>>> df_multindex
+ cost revenue
+Q1 A 250 100
+ B 150 250
+ C 100 300
+Q2 A 150 200
+ B 300 175
+ C 220 225
+
>>> df.le(df_multindex, level=1)
+ cost revenue
+Q1 A True True
+ B True True
+ C True True
+Q2 A False True
+ B True False
+ C True False
+
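The boolean result composes with reductions, for example to test whole rows against a reference row; a minimal sketch reusing the cost/revenue df from above:
+>>> (df == [250, 100]).all(axis=1)
+A     True
+B    False
+C    False
+dtype: bool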
ne
(
other
, axis='columns'
, level=None
)
Get Not equal to of dataframe and other, element-wise (binary operator ne
).
Among flexible wrappers (eq
, ne
, le
, lt
, ge
, gt
) to comparison
+operators.
Equivalent to ==
, !=
, <=
, <
, >=
, >
with support to choose axis
+(rows or columns) and level for comparison.
other
+(scalar, sequence, Series, or DataFrame)
+— Any single or multiple element data structure, or list-like object.axis
+({0 or 'index', 1 or 'columns'}, default 'columns')
+— Whether to compare by the index (0 or 'index') or columns(1 or 'columns').
+level
+(int or label)
+— Broadcast across a level, matching Index values on the passedMultiIndex level.
+Result of the comparison.
DataFrame.eq : Compare DataFrames for equality elementwise.
DataFrame.ne : Compare DataFrames for inequality elementwise.
DataFrame.le : Compare DataFrames for less than inequality or equality elementwise.
DataFrame.lt : Compare DataFrames for strictly less than inequality elementwise.
DataFrame.ge : Compare DataFrames for greater than inequality or equality elementwise.
DataFrame.gt : Compare DataFrames for strictly greater than inequality elementwise.
+Notes
+Mismatched indices will be unioned together.
+NaN values are considered different (i.e. NaN != NaN).
>>> df = pd.DataFrame({'cost': [250, 150, 100],
+... 'revenue': [100, 250, 300]},
+... index=['A', 'B', 'C'])
+>>> df
+ cost revenue
+A 250 100
+B 150 250
+C 100 300
+
Comparison with a scalar, using either the operator or method:
+>>> df == 100
+ cost revenue
+A False True
+B False False
+C True False
+
>>> df.eq(100)
+ cost revenue
+A False True
+B False False
+C True False
+
When other
is a :class:Series
, the columns of a DataFrame are aligned
+with the index of other
and broadcast:
>>> df != pd.Series([100, 250], index=["cost", "revenue"])
+ cost revenue
+A True True
+B True False
+C False True
+
Use the method to control the broadcast axis:
+>>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index')
+ cost revenue
+A True False
+B True True
+C True True
+D True True
+
When comparing to an arbitrary sequence, the number of columns must
+match the number of elements in other
:
>>> df == [250, 100]
+ cost revenue
+A True True
+B False False
+C False False
+
Use the method to control the axis:
+>>> df.eq([250, 250, 100], axis='index')
+ cost revenue
+A True False
+B False True
+C True False
+
Compare to a DataFrame of different shape.
+>>> other = pd.DataFrame({'revenue': [300, 250, 100, 150]},
+... index=['A', 'B', 'C', 'D'])
+>>> other
+ revenue
+A 300
+B 250
+C 100
+D 150
+
>>> df.gt(other)
+ cost revenue
+A False False
+B False False
+C False True
+D False False
+
Compare to a MultiIndex by level.
+>>> df_multindex = pd.DataFrame({'cost': [250, 150, 100, 150, 300, 220],
+... 'revenue': [100, 250, 300, 200, 175, 225]},
+... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
+... ['A', 'B', 'C', 'A', 'B', 'C']])
+>>> df_multindex
+ cost revenue
+Q1 A 250 100
+ B 150 250
+ C 100 300
+Q2 A 150 200
+ B 300 175
+ C 220 225
+
>>> df.le(df_multindex, level=1)
+ cost revenue
+Q1 A True True
+ B True True
+ C True True
+Q2 A False True
+ B True False
+ C True False
+
le
(
other
, axis='columns'
, level=None
)
Get Less than or equal to of dataframe and other, element-wise (binary operator le
).
Among flexible wrappers (eq
, ne
, le
, lt
, ge
, gt
) to comparison
+operators.
Equivalent to ==
, !=
, <=
, <
, >=
, >
with support to choose axis
+(rows or columns) and level for comparison.
other
+(scalar, sequence, Series, or DataFrame)
+— Any single or multiple element data structure, or list-like object.axis
+({0 or 'index', 1 or 'columns'}, default 'columns')
+— Whether to compare by the index (0 or 'index') or columns(1 or 'columns').
+level
+(int or label)
+— Broadcast across a level, matching Index values on the passedMultiIndex level.
+Result of the comparison.
DataFrame.eq : Compare DataFrames for equality elementwise.
DataFrame.ne : Compare DataFrames for inequality elementwise.
DataFrame.le : Compare DataFrames for less than inequality or equality elementwise.
DataFrame.lt : Compare DataFrames for strictly less than inequality elementwise.
DataFrame.ge : Compare DataFrames for greater than inequality or equality elementwise.
DataFrame.gt : Compare DataFrames for strictly greater than inequality elementwise.
+Notes
+Mismatched indices will be unioned together.
+NaN values are considered different (i.e. NaN != NaN).
>>> df = pd.DataFrame({'cost': [250, 150, 100],
+... 'revenue': [100, 250, 300]},
+... index=['A', 'B', 'C'])
+>>> df
+ cost revenue
+A 250 100
+B 150 250
+C 100 300
+
Comparison with a scalar, using either the operator or method:
+>>> df == 100
+ cost revenue
+A False True
+B False False
+C True False
+
>>> df.eq(100)
+ cost revenue
+A False True
+B False False
+C True False
+
When other
is a :class:Series
, the columns of a DataFrame are aligned
+with the index of other
and broadcast:
>>> df != pd.Series([100, 250], index=["cost", "revenue"])
+ cost revenue
+A True True
+B True False
+C False True
+
Use the method to control the broadcast axis:
+>>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index')
+ cost revenue
+A True False
+B True True
+C True True
+D True True
+
When comparing to an arbitrary sequence, the number of columns must
+match the number of elements in other
:
>>> df == [250, 100]
+ cost revenue
+A True True
+B False False
+C False False
+
Use the method to control the axis:
+>>> df.eq([250, 250, 100], axis='index')
+ cost revenue
+A True False
+B False True
+C True False
+
Compare to a DataFrame of different shape.
+>>> other = pd.DataFrame({'revenue': [300, 250, 100, 150]},
+... index=['A', 'B', 'C', 'D'])
+>>> other
+ revenue
+A 300
+B 250
+C 100
+D 150
+
>>> df.gt(other)
+ cost revenue
+A False False
+B False False
+C False True
+D False False
+
Compare to a MultiIndex by level.
+>>> df_multindex = pd.DataFrame({'cost': [250, 150, 100, 150, 300, 220],
+... 'revenue': [100, 250, 300, 200, 175, 225]},
+... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
+... ['A', 'B', 'C', 'A', 'B', 'C']])
+>>> df_multindex
+ cost revenue
+Q1 A 250 100
+ B 150 250
+ C 100 300
+Q2 A 150 200
+ B 300 175
+ C 220 225
+
>>> df.le(df_multindex, level=1)
+ cost revenue
+Q1 A True True
+ B True True
+ C True True
+Q2 A False True
+ B True False
+ C True False
+
lt
(
other
, axis='columns'
, level=None
)
Get Less than of dataframe and other, element-wise (binary operator lt
).
Among flexible wrappers (eq
, ne
, le
, lt
, ge
, gt
) to comparison
+operators.
Equivalent to ==
, !=
, <=
, <
, >=
, >
with support to choose axis
+(rows or columns) and level for comparison.
other
+(scalar, sequence, Series, or DataFrame)
+— Any single or multiple element data structure, or list-like object.axis
+({0 or 'index', 1 or 'columns'}, default 'columns')
+— Whether to compare by the index (0 or 'index') or columns(1 or 'columns').
+level
+(int or label)
+— Broadcast across a level, matching Index values on the passedMultiIndex level.
+Result of the comparison.
DataFrame.eq : Compare DataFrames for equality elementwise.
DataFrame.ne : Compare DataFrames for inequality elementwise.
DataFrame.le : Compare DataFrames for less than inequality or equality elementwise.
DataFrame.lt : Compare DataFrames for strictly less than inequality elementwise.
DataFrame.ge : Compare DataFrames for greater than inequality or equality elementwise.
DataFrame.gt : Compare DataFrames for strictly greater than inequality elementwise.
+Notes
+Mismatched indices will be unioned together.
+NaN values are considered different (i.e. NaN != NaN).
>>> df = pd.DataFrame({'cost': [250, 150, 100],
+... 'revenue': [100, 250, 300]},
+... index=['A', 'B', 'C'])
+>>> df
+ cost revenue
+A 250 100
+B 150 250
+C 100 300
+
Comparison with a scalar, using either the operator or method:
+>>> df == 100
+ cost revenue
+A False True
+B False False
+C True False
+
>>> df.eq(100)
+ cost revenue
+A False True
+B False False
+C True False
+
When other
is a :class:Series
, the columns of a DataFrame are aligned
+with the index of other
and broadcast:
>>> df != pd.Series([100, 250], index=["cost", "revenue"])
+ cost revenue
+A True True
+B True False
+C False True
+
Use the method to control the broadcast axis:
+>>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index')
+ cost revenue
+A True False
+B True True
+C True True
+D True True
+
When comparing to an arbitrary sequence, the number of columns must
+match the number of elements in other
:
>>> df == [250, 100]
+ cost revenue
+A True True
+B False False
+C False False
+
Use the method to control the axis:
+>>> df.eq([250, 250, 100], axis='index')
+ cost revenue
+A True False
+B False True
+C True False
+
Compare to a DataFrame of different shape.
+>>> other = pd.DataFrame({'revenue': [300, 250, 100, 150]},
+... index=['A', 'B', 'C', 'D'])
+>>> other
+ revenue
+A 300
+B 250
+C 100
+D 150
+
>>> df.gt(other)
+ cost revenue
+A False False
+B False False
+C False True
+D False False
+
Compare to a MultiIndex by level.
+>>> df_multindex = pd.DataFrame({'cost': [250, 150, 100, 150, 300, 220],
+... 'revenue': [100, 250, 300, 200, 175, 225]},
+... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
+... ['A', 'B', 'C', 'A', 'B', 'C']])
+>>> df_multindex
+ cost revenue
+Q1 A 250 100
+ B 150 250
+ C 100 300
+Q2 A 150 200
+ B 300 175
+ C 220 225
+
>>> df.le(df_multindex, level=1)
+ cost revenue
+Q1 A True True
+ B True True
+ C True True
+Q2 A False True
+ B True False
+ C True False
+
ge
(
other
, axis='columns'
, level=None
)
Get Greater than or equal to of dataframe and other, element-wise (binary operator ge
).
Among flexible wrappers (eq
, ne
, le
, lt
, ge
, gt
) to comparison
+operators.
Equivalent to ==
, !=
, <=
, <
, >=
, >
with support to choose axis
+(rows or columns) and level for comparison.
other
+(scalar, sequence, Series, or DataFrame)
+— Any single or multiple element data structure, or list-like object.axis
+({0 or 'index', 1 or 'columns'}, default 'columns')
+— Whether to compare by the index (0 or 'index') or columns(1 or 'columns').
+level
+(int or label)
+— Broadcast across a level, matching Index values on the passedMultiIndex level.
+Result of the comparison.
DataFrame.eq : Compare DataFrames for equality elementwise.
DataFrame.ne : Compare DataFrames for inequality elementwise.
DataFrame.le : Compare DataFrames for less than inequality or equality elementwise.
DataFrame.lt : Compare DataFrames for strictly less than inequality elementwise.
DataFrame.ge : Compare DataFrames for greater than inequality or equality elementwise.
DataFrame.gt : Compare DataFrames for strictly greater than inequality elementwise.
+Notes
+Mismatched indices will be unioned together.
+NaN values are considered different (i.e. NaN != NaN).
>>> df = pd.DataFrame({'cost': [250, 150, 100],
+... 'revenue': [100, 250, 300]},
+... index=['A', 'B', 'C'])
+>>> df
+ cost revenue
+A 250 100
+B 150 250
+C 100 300
+
Comparison with a scalar, using either the operator or method:
+>>> df == 100
+ cost revenue
+A False True
+B False False
+C True False
+
>>> df.eq(100)
+ cost revenue
+A False True
+B False False
+C True False
+
When other
is a :class:Series
, the columns of a DataFrame are aligned
+with the index of other
and broadcast:
>>> df != pd.Series([100, 250], index=["cost", "revenue"])
+ cost revenue
+A True True
+B True False
+C False True
+
Use the method to control the broadcast axis:
+>>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index')
+ cost revenue
+A True False
+B True True
+C True True
+D True True
+
When comparing to an arbitrary sequence, the number of columns must
+match the number of elements in other
:
>>> df == [250, 100]
+ cost revenue
+A True True
+B False False
+C False False
+
Use the method to control the axis:
+>>> df.eq([250, 250, 100], axis='index')
+ cost revenue
+A True False
+B False True
+C True False
+
Compare to a DataFrame of different shape.
+>>> other = pd.DataFrame({'revenue': [300, 250, 100, 150]},
+... index=['A', 'B', 'C', 'D'])
+>>> other
+ revenue
+A 300
+B 250
+C 100
+D 150
+
>>> df.gt(other)
+ cost revenue
+A False False
+B False False
+C False True
+D False False
+
Compare to a MultiIndex by level.
+>>> df_multindex = pd.DataFrame({'cost': [250, 150, 100, 150, 300, 220],
+... 'revenue': [100, 250, 300, 200, 175, 225]},
+... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
+... ['A', 'B', 'C', 'A', 'B', 'C']])
+>>> df_multindex
+ cost revenue
+Q1 A 250 100
+ B 150 250
+ C 100 300
+Q2 A 150 200
+ B 300 175
+ C 220 225
+
>>> df.le(df_multindex, level=1)
+ cost revenue
+Q1 A True True
+ B True True
+ C True True
+Q2 A False True
+ B True False
+ C True False
+
gt
(
other
, axis='columns'
, level=None
)
Get Greater than of dataframe and other, element-wise (binary operator gt
).
Among flexible wrappers (eq
, ne
, le
, lt
, ge
, gt
) to comparison
+operators.
Equivalent to ==
, !=
, <=
, <
, >=
, >
with support to choose axis
+(rows or columns) and level for comparison.
other
+(scalar, sequence, Series, or DataFrame)
+— Any single or multiple element data structure, or list-like object.axis
+({0 or 'index', 1 or 'columns'}, default 'columns')
+— Whether to compare by the index (0 or 'index') or columns(1 or 'columns').
+level
+(int or label)
+— Broadcast across a level, matching Index values on the passedMultiIndex level.
+Result of the comparison.
DataFrame.eq : Compare DataFrames for equality elementwise.
DataFrame.ne : Compare DataFrames for inequality elementwise.
DataFrame.le : Compare DataFrames for less than inequality or equality elementwise.
DataFrame.lt : Compare DataFrames for strictly less than inequality elementwise.
DataFrame.ge : Compare DataFrames for greater than inequality or equality elementwise.
DataFrame.gt : Compare DataFrames for strictly greater than inequality elementwise.
+Notes
+Mismatched indices will be unioned together.
+NaN values are considered different (i.e. NaN != NaN).
>>> df = pd.DataFrame({'cost': [250, 150, 100],
+... 'revenue': [100, 250, 300]},
+... index=['A', 'B', 'C'])
+>>> df
+ cost revenue
+A 250 100
+B 150 250
+C 100 300
+
Comparison with a scalar, using either the operator or method:
+>>> df == 100
+ cost revenue
+A False True
+B False False
+C True False
+
>>> df.eq(100)
+ cost revenue
+A False True
+B False False
+C True False
+
When other
is a :class:Series
, the columns of a DataFrame are aligned
+with the index of other
and broadcast:
>>> df != pd.Series([100, 250], index=["cost", "revenue"])
+ cost revenue
+A True True
+B True False
+C False True
+
Use the method to control the broadcast axis:
+>>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index')
+ cost revenue
+A True False
+B True True
+C True True
+D True True
+
When comparing to an arbitrary sequence, the number of columns must
+match the number of elements in other
:
>>> df == [250, 100]
+ cost revenue
+A True True
+B False False
+C False False
+
Use the method to control the axis:
+>>> df.eq([250, 250, 100], axis='index')
+ cost revenue
+A True False
+B False True
+C True False
+
Compare to a DataFrame of different shape.
+>>> other = pd.DataFrame({'revenue': [300, 250, 100, 150]},
+... index=['A', 'B', 'C', 'D'])
+>>> other
+ revenue
+A 300
+B 250
+C 100
+D 150
+
>>> df.gt(other)
+ cost revenue
+A False False
+B False False
+C False True
+D False False
+
Compare to a MultiIndex by level.
+>>> df_multindex = pd.DataFrame({'cost': [250, 150, 100, 150, 300, 220],
+... 'revenue': [100, 250, 300, 200, 175, 225]},
+... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
+... ['A', 'B', 'C', 'A', 'B', 'C']])
+>>> df_multindex
+ cost revenue
+Q1 A 250 100
+ B 150 250
+ C 100 300
+Q2 A 150 200
+ B 300 175
+ C 220 225
+
>>> df.le(df_multindex, level=1)
+ cost revenue
+Q1 A True True
+ B True True
+ C True True
+Q2 A False True
+ B True False
+ C True False
+
add
(
other
, axis='columns'
, level=None
, fill_value=None
)
Get Addition of dataframe and other, element-wise (binary operator add
).
Equivalent to dataframe + other
, but with support to substitute a fill_value
+for missing data in one of the inputs. With reverse version, radd
.
Among flexible wrappers (add
, sub
, mul
, div
, floordiv
, mod
, pow
) to
+arithmetic operators: +
, -
, *
, /
, //
, %
, **
.
other
+(scalar, sequence, Series, dict or DataFrame)
+— Any single or multiple element data structure, or list-like object.axis
+({0 or 'index', 1 or 'columns'})
+— Whether to compare by the index (0 or 'index') or columns.(1 or 'columns'). For Series input, axis to match Series index on.
+level
+(int or label)
+— Broadcast across a level, matching Index values on thepassed MultiIndex level.
+fill_value
+(float or None, default None)
+— Fill existing missing (NaN) values, and any new element needed forsuccessful DataFrame alignment, with this value before computation.
+If data in both corresponding DataFrame locations is missing
+the result will be missing.
+Result of the arithmetic operation.
DataFrame.add : Add DataFrames.
DataFrame.sub : Subtract DataFrames.
DataFrame.mul : Multiply DataFrames.
DataFrame.div : Divide DataFrames (float division).
DataFrame.truediv : Divide DataFrames (float division).
DataFrame.floordiv : Divide DataFrames (integer division).
DataFrame.mod : Calculate modulo (remainder after division).
DataFrame.pow : Calculate exponential power.
+Notes
+Mismatched indices will be unioned together.
+>>> df = pd.DataFrame({'angles': [0, 3, 4],
+... 'degrees': [360, 180, 360]},
+... index=['circle', 'triangle', 'rectangle'])
+>>> df
+ angles degrees
+circle 0 360
+triangle 3 180
+rectangle 4 360
+
Add a scalar with the operator version, which returns the same results.
+>>> df + 1
+ angles degrees
+circle 1 361
+triangle 4 181
+rectangle 5 361
+
>>> df.add(1)
+ angles degrees
+circle 1 361
+triangle 4 181
+rectangle 5 361
+
Divide by constant with reverse version.
+>>> df.div(10)
+ angles degrees
+circle 0.0 36.0
+triangle 0.3 18.0
+rectangle 0.4 36.0
+
>>> df.rdiv(10)
+ angles degrees
+circle inf 0.027778
+triangle 3.333333 0.055556
+rectangle 2.500000 0.027778
+
Subtract a list and Series by axis with operator version.
+>>> df - [1, 2]
+ angles degrees
+circle -1 358
+triangle 2 178
+rectangle 3 358
+
>>> df.sub([1, 2], axis='columns')
+ angles degrees
+circle -1 358
+triangle 2 178
+rectangle 3 358
+
>>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']),
+... axis='index')
+ angles degrees
+circle -1 359
+triangle 2 179
+rectangle 3 359
+
Multiply a dictionary by axis.
+>>> df.mul({'angles': 0, 'degrees': 2})
+ angles degrees
+circle 0 720
+triangle 0 360
+rectangle 0 720
+
>>> df.mul({'circle': 0, 'triangle': 2, 'rectangle': 3}, axis='index')
+ angles degrees
+circle 0 0
+triangle 6 360
+rectangle 12 1080
+
Multiply a DataFrame of different shape with operator version.
+>>> other = pd.DataFrame({'angles': [0, 3, 4]},
+... index=['circle', 'triangle', 'rectangle'])
+>>> other
+ angles
+circle 0
+triangle 3
+rectangle 4
+
>>> df * other
+ angles degrees
+circle 0 NaN
+triangle 9 NaN
+rectangle 16 NaN
+
>>> df.mul(other, fill_value=0)
+ angles degrees
+circle 0 0.0
+triangle 9 0.0
+rectangle 16 0.0
+
Divide by a MultiIndex by level.
+>>> df_multindex = pd.DataFrame({'angles': [0, 3, 4, 4, 5, 6],
+... 'degrees': [360, 180, 360, 360, 540, 720]},
+... index=[['A', 'A', 'A', 'B', 'B', 'B'],
+... ['circle', 'triangle', 'rectangle',
+... 'square', 'pentagon', 'hexagon']])
+>>> df_multindex
+ angles degrees
+A circle 0 360
+ triangle 3 180
+ rectangle 4 360
+B square 4 360
+ pentagon 5 540
+ hexagon 6 720
+
>>> df.div(df_multindex, level=1, fill_value=0)
+ angles degrees
+A circle NaN 1.0
+ triangle 1.0 1.0
+ rectangle 1.0 1.0
+B square 0.0 0.0
+ pentagon 0.0 0.0
+ hexagon 0.0 0.0
+
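fill_value works with add in the same way; a short sketch reusing df and other from above, where the missing degrees column in other is treated as 0 before the addition:
+>>> df.add(other, fill_value=0)
+           angles  degrees
+circle          0    360.0
+triangle        6    180.0
+rectangle       8    360.0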
radd
(
other
, axis='columns'
, level=None
, fill_value=None
)
Get Addition of dataframe and other, element-wise (binary operator radd
).
Equivalent to other + dataframe
, but with support to substitute a fill_value
+for missing data in one of the inputs. With reverse version, add
.
Among flexible wrappers (add
, sub
, mul
, div
, floordiv
, mod
, pow
) to
+arithmetic operators: +
, -
, *
, /
, //
, %
, **
.
other
+(scalar, sequence, Series, dict or DataFrame)
+— Any single or multiple element data structure, or list-like object.axis
+({0 or 'index', 1 or 'columns'})
+— Whether to compare by the index (0 or 'index') or columns.(1 or 'columns'). For Series input, axis to match Series index on.
+level
+(int or label)
+— Broadcast across a level, matching Index values on thepassed MultiIndex level.
+fill_value
+(float or None, default None)
+— Fill existing missing (NaN) values, and any new element needed forsuccessful DataFrame alignment, with this value before computation.
+If data in both corresponding DataFrame locations is missing
+the result will be missing.
+Result of the arithmetic operation.
DataFrame.add : Add DataFrames.
DataFrame.sub : Subtract DataFrames.
DataFrame.mul : Multiply DataFrames.
DataFrame.div : Divide DataFrames (float division).
DataFrame.truediv : Divide DataFrames (float division).
DataFrame.floordiv : Divide DataFrames (integer division).
DataFrame.mod : Calculate modulo (remainder after division).
DataFrame.pow : Calculate exponential power.
+Notes
+Mismatched indices will be unioned together.
+>>> df = pd.DataFrame({'angles': [0, 3, 4],
+... 'degrees': [360, 180, 360]},
+... index=['circle', 'triangle', 'rectangle'])
+>>> df
+ angles degrees
+circle 0 360
+triangle 3 180
+rectangle 4 360
+
Add a scalar with the operator version, which returns the same results.
+>>> df + 1
+ angles degrees
+circle 1 361
+triangle 4 181
+rectangle 5 361
+
>>> df.add(1)
+ angles degrees
+circle 1 361
+triangle 4 181
+rectangle 5 361
+
Divide by constant with reverse version.
+>>> df.div(10)
+ angles degrees
+circle 0.0 36.0
+triangle 0.3 18.0
+rectangle 0.4 36.0
+
>>> df.rdiv(10)
+ angles degrees
+circle inf 0.027778
+triangle 3.333333 0.055556
+rectangle 2.500000 0.027778
+
Subtract a list and Series by axis with operator version.
+>>> df - [1, 2]
+ angles degrees
+circle -1 358
+triangle 2 178
+rectangle 3 358
+
>>> df.sub([1, 2], axis='columns')
+ angles degrees
+circle -1 358
+triangle 2 178
+rectangle 3 358
+
>>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']),
+... axis='index')
+ angles degrees
+circle -1 359
+triangle 2 179
+rectangle 3 359
+
Multiply a dictionary by axis.
+>>> df.mul({'angles': 0, 'degrees': 2})
+ angles degrees
+circle 0 720
+triangle 0 360
+rectangle 0 720
+
>>> df.mul({'circle': 0, 'triangle': 2, 'rectangle': 3}, axis='index')
+ angles degrees
+circle 0 0
+triangle 6 360
+rectangle 12 1080
+
Multiply a DataFrame of different shape with operator version.
+>>> other = pd.DataFrame({'angles': [0, 3, 4]},
+... index=['circle', 'triangle', 'rectangle'])
+>>> other
+ angles
+circle 0
+triangle 3
+rectangle 4
+
>>> df * other
+ angles degrees
+circle 0 NaN
+triangle 9 NaN
+rectangle 16 NaN
+
>>> df.mul(other, fill_value=0)
+ angles degrees
+circle 0 0.0
+triangle 9 0.0
+rectangle 16 0.0
+
Divide by a MultiIndex by level.
+>>> df_multindex = pd.DataFrame({'angles': [0, 3, 4, 4, 5, 6],
+... 'degrees': [360, 180, 360, 360, 540, 720]},
+... index=[['A', 'A', 'A', 'B', 'B', 'B'],
+... ['circle', 'triangle', 'rectangle',
+... 'square', 'pentagon', 'hexagon']])
+>>> df_multindex
+ angles degrees
+A circle 0 360
+ triangle 3 180
+ rectangle 4 360
+B square 4 360
+ pentagon 5 540
+ hexagon 6 720
+
>>> df.div(df_multindex, level=1, fill_value=0)
+ angles degrees
+A circle NaN 1.0
+ triangle 1.0 1.0
+ rectangle 1.0 1.0
+B square 0.0 0.0
+ pentagon 0.0 0.0
+ hexagon 0.0 0.0
+
sub
(
other
, axis='columns'
, level=None
, fill_value=None
)
Get Subtraction of dataframe and other, element-wise (binary operator sub
).
Equivalent to dataframe - other
, but with support to substitute a fill_value
+for missing data in one of the inputs. With reverse version, rsub
.
Among flexible wrappers (add
, sub
, mul
, div
, floordiv
, mod
, pow
) to
+arithmetic operators: +
, -
, *
, /
, //
, %
, **
.
other
+(scalar, sequence, Series, dict or DataFrame)
+— Any single or multiple element data structure, or list-like object.axis
+({0 or 'index', 1 or 'columns'})
+— Whether to compare by the index (0 or 'index') or columns.(1 or 'columns'). For Series input, axis to match Series index on.
+level
+(int or label)
+— Broadcast across a level, matching Index values on thepassed MultiIndex level.
+fill_value
+(float or None, default None)
+— Fill existing missing (NaN) values, and any new element needed forsuccessful DataFrame alignment, with this value before computation.
+If data in both corresponding DataFrame locations is missing
+the result will be missing.
+Result of the arithmetic operation.
DataFrame.add : Add DataFrames.
DataFrame.sub : Subtract DataFrames.
DataFrame.mul : Multiply DataFrames.
DataFrame.div : Divide DataFrames (float division).
DataFrame.truediv : Divide DataFrames (float division).
DataFrame.floordiv : Divide DataFrames (integer division).
DataFrame.mod : Calculate modulo (remainder after division).
DataFrame.pow : Calculate exponential power.
+Notes
+Mismatched indices will be unioned together.
+>>> df = pd.DataFrame({'angles': [0, 3, 4],
+... 'degrees': [360, 180, 360]},
+... index=['circle', 'triangle', 'rectangle'])
+>>> df
+ angles degrees
+circle 0 360
+triangle 3 180
+rectangle 4 360
+
Add a scalar with the operator version, which returns the same results.
+>>> df + 1
+ angles degrees
+circle 1 361
+triangle 4 181
+rectangle 5 361
+
>>> df.add(1)
+ angles degrees
+circle 1 361
+triangle 4 181
+rectangle 5 361
+
Divide by constant with reverse version.
+>>> df.div(10)
+ angles degrees
+circle 0.0 36.0
+triangle 0.3 18.0
+rectangle 0.4 36.0
+
>>> df.rdiv(10)
+ angles degrees
+circle inf 0.027778
+triangle 3.333333 0.055556
+rectangle 2.500000 0.027778
+
Subtract a list and Series by axis with operator version.
+>>> df - [1, 2]
+ angles degrees
+circle -1 358
+triangle 2 178
+rectangle 3 358
+
>>> df.sub([1, 2], axis='columns')
+ angles degrees
+circle -1 358
+triangle 2 178
+rectangle 3 358
+
>>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']),
+... axis='index')
+ angles degrees
+circle -1 359
+triangle 2 179
+rectangle 3 359
+
Multiply a dictionary by axis.
+>>> df.mul({'angles': 0, 'degrees': 2})
+ angles degrees
+circle 0 720
+triangle 0 360
+rectangle 0 720
+
>>> df.mul({'circle': 0, 'triangle': 2, 'rectangle': 3}, axis='index')
+ angles degrees
+circle 0 0
+triangle 6 360
+rectangle 12 1080
+
Multiply a DataFrame of different shape with operator version.
+>>> other = pd.DataFrame({'angles': [0, 3, 4]},
+... index=['circle', 'triangle', 'rectangle'])
+>>> other
+ angles
+circle 0
+triangle 3
+rectangle 4
+
>>> df * other
+ angles degrees
+circle 0 NaN
+triangle 9 NaN
+rectangle 16 NaN
+
>>> df.mul(other, fill_value=0)
+ angles degrees
+circle 0 0.0
+triangle 9 0.0
+rectangle 16 0.0
+
Divide by a MultiIndex by level.
+>>> df_multindex = pd.DataFrame({'angles': [0, 3, 4, 4, 5, 6],
+... 'degrees': [360, 180, 360, 360, 540, 720]},
+... index=[['A', 'A', 'A', 'B', 'B', 'B'],
+... ['circle', 'triangle', 'rectangle',
+... 'square', 'pentagon', 'hexagon']])
+>>> df_multindex
+ angles degrees
+A circle 0 360
+ triangle 3 180
+ rectangle 4 360
+B square 4 360
+ pentagon 5 540
+ hexagon 6 720
+
>>> df.div(df_multindex, level=1, fill_value=0)
+ angles degrees
+A circle NaN 1.0
+ triangle 1.0 1.0
+ rectangle 1.0 1.0
+B square 0.0 0.0
+ pentagon 0.0 0.0
+ hexagon 0.0 0.0
+
rsub
(
other
, axis='columns'
, level=None
, fill_value=None
)
Get Subtraction of dataframe and other, element-wise (binary operator rsub
).
Equivalent to other - dataframe
, but with support to substitute a fill_value
+for missing data in one of the inputs. With reverse version, sub
.
Among flexible wrappers (add
, sub
, mul
, div
, floordiv
, mod
, pow
) to
+arithmetic operators: +
, -
, *
, /
, //
, %
, **
.
other (scalar, sequence, Series, dict or DataFrame) — Any single or multiple element data structure, or list-like object.
axis ({0 or 'index', 1 or 'columns'}) — Whether to compare by the index (0 or 'index') or columns (1 or 'columns'). For Series input, axis to match Series index on.
level (int or label) — Broadcast across a level, matching Index values on the passed MultiIndex level.
fill_value (float or None, default None) — Fill existing missing (NaN) values, and any new element needed for successful DataFrame alignment, with this value before computation. If data in both corresponding DataFrame locations is missing the result will be missing.
Returns: Result of the arithmetic operation.
See Also, Notes, and Examples are identical to those shown above for the flexible arithmetic wrappers.
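For illustration only (this sketch is not part of the original docstring), rsub with a scalar on the df defined in the examples above:
>>> df.rsub(1)          # same as 1 - df
           angles  degrees
circle          1     -359
triangle       -2     -179
rectangle      -3     -359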
mul(other, axis='columns', level=None, fill_value=None)
Get Multiplication of dataframe and other, element-wise (binary operator mul).
Equivalent to dataframe * other, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, rmul.
Among flexible wrappers (add, sub, mul, div, floordiv, mod, pow) to arithmetic operators: +, -, *, /, //, %, **.
Parameters, return value, See Also, Notes, and Examples are identical to those of rsub above; df.mul itself is demonstrated in the shared examples.
rmul(other, axis='columns', level=None, fill_value=None)
Get Multiplication of dataframe and other, element-wise (binary operator rmul).
Equivalent to other * dataframe, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, mul.
Among flexible wrappers (add, sub, mul, div, floordiv, mod, pow) to arithmetic operators: +, -, *, /, //, %, **.
Parameters, return value, See Also, Notes, and Examples are identical to those of rsub above.
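A minimal sketch (not from the original docstring) of rmul on the df defined above; since multiplication commutes, it matches df.mul(2):
>>> df.rmul(2)          # same as 2 * df
           angles  degrees
circle          0      720
triangle        6      360
rectangle       8      720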
truediv(other, axis='columns', level=None, fill_value=None)
Get Floating division of dataframe and other, element-wise (binary operator truediv).
Equivalent to dataframe / other, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, rtruediv.
Among flexible wrappers (add, sub, mul, div, floordiv, mod, pow) to arithmetic operators: +, -, *, /, //, %, **.
Parameters, return value, See Also, Notes, and Examples are identical to those of rsub above.
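One brief note not in the original docstring: div is an alias of truediv, so the df.div(10) example above applies as-is:
>>> df.truediv(10)      # identical to df.div(10) and df / 10
           angles  degrees
circle        0.0     36.0
triangle      0.3     18.0
rectangle     0.4     36.0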
rtruediv(other, axis='columns', level=None, fill_value=None)
Get Floating division of dataframe and other, element-wise (binary operator rtruediv).
Equivalent to other / dataframe, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, truediv.
Among flexible wrappers (add, sub, mul, div, floordiv, mod, pow) to arithmetic operators: +, -, *, /, //, %, **.
Parameters, return value, See Also, Notes, and Examples are identical to those of rsub above.
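Likewise (a note not in the original docstring), rdiv as used in the shared examples is an alias of rtruediv:
>>> df.rtruediv(10)     # identical to df.rdiv(10) and 10 / df
             angles   degrees
circle          inf  0.027778
triangle   3.333333  0.055556
rectangle  2.500000  0.027778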
floordiv(other, axis='columns', level=None, fill_value=None)
Get Integer division of dataframe and other, element-wise (binary operator floordiv).
Equivalent to dataframe // other, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, rfloordiv.
Among flexible wrappers (add, sub, mul, div, floordiv, mod, pow) to arithmetic operators: +, -, *, /, //, %, **.
Parameters, return value, See Also, Notes, and Examples are identical to those of rsub above.
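A minimal sketch (not part of the original docstring) of integer division on the df defined above:
>>> df.floordiv(100)    # same as df // 100
           angles  degrees
circle          0        3
triangle        0        1
rectangle       0        3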
rfloordiv(other, axis='columns', level=None, fill_value=None)
Get Integer division of dataframe and other, element-wise (binary operator rfloordiv).
Equivalent to other // dataframe, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, floordiv.
Among flexible wrappers (add, sub, mul, div, floordiv, mod, pow) to arithmetic operators: +, -, *, /, //, %, **.
Parameters, return value, See Also, Notes, and Examples are identical to those of rsub above.
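A minimal, hypothetical sketch (not from the original docstring; the small frame d is made up here so the divisor column contains no zero):
>>> d = pd.DataFrame({'x': [2, 5, 9]})
>>> d.rfloordiv(10)     # same as 10 // d
   x
0  5
1  2
2  1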
mod(other, axis='columns', level=None, fill_value=None)
Get Modulo of dataframe and other, element-wise (binary operator mod).
Equivalent to dataframe % other, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, rmod.
Among flexible wrappers (add, sub, mul, div, floordiv, mod, pow) to arithmetic operators: +, -, *, /, //, %, **.
Parameters, return value, See Also, Notes, and Examples are identical to those of rsub above.
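A minimal sketch (not part of the original docstring) on the df defined above:
>>> df.mod(100)         # same as df % 100
           angles  degrees
circle          0       60
triangle        3       80
rectangle       4       60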
rmod(other, axis='columns', level=None, fill_value=None)
Get Modulo of dataframe and other, element-wise (binary operator rmod).
Equivalent to other % dataframe, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, mod.
Among flexible wrappers (add, sub, mul, div, floordiv, mod, pow) to arithmetic operators: +, -, *, /, //, %, **.
Parameters, return value, See Also, Notes, and Examples are identical to those of rsub above.
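A minimal, hypothetical sketch (not from the original docstring; the small frame d is made up so no element is zero):
>>> d = pd.DataFrame({'x': [3, 4, 7]})
>>> d.rmod(10)          # same as 10 % d
   x
0  1
1  2
2  3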
pow(other, axis='columns', level=None, fill_value=None)
Get Exponential power of dataframe and other, element-wise (binary operator pow).
Equivalent to dataframe ** other, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, rpow.
Among flexible wrappers (add, sub, mul, div, floordiv, mod, pow) to arithmetic operators: +, -, *, /, //, %, **.
Parameters, return value, See Also, Notes, and Examples are identical to those of rsub above.
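A minimal sketch (not part of the original docstring) on the df defined above:
>>> df.pow(2)           # same as df ** 2
           angles  degrees
circle          0   129600
triangle        9    32400
rectangle      16   129600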
rpow(other, axis='columns', level=None, fill_value=None)
Get Exponential power of dataframe and other, element-wise (binary operator rpow).
Equivalent to other ** dataframe, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, pow.
Among flexible wrappers (add, sub, mul, div, floordiv, mod, pow) to arithmetic operators: +, -, *, /, //, %, **.
Parameters, return value, See Also, Notes, and Examples are identical to those of rsub above.
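A minimal, hypothetical sketch (not from the original docstring; the small frame d is made up to keep the numbers readable):
>>> d = pd.DataFrame({'x': [1, 2, 3]})
>>> d.rpow(2)           # same as 2 ** d
   x
0  2
1  4
2  8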
compare(other, align_axis=1, keep_shape=False, keep_equal=False, result_names=('self', 'other'))
Compare to another DataFrame and show the differences.
other (DataFrame) — Object to compare with.
align_axis ({0 or 'index', 1 or 'columns'}, default 1) — Determine which axis to align the comparison on.
keep_shape (bool, default False) — If true, all rows and columns are kept. Otherwise, only the ones with different values are kept.
keep_equal (bool, default False) — If true, the result keeps values that are equal. Otherwise, equal values are shown as NaNs.
result_names (tuple, default ('self', 'other')) — Set the dataframes names in the comparison.
Returns: DataFrame that shows the differences stacked side by side. The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level.
Raises: ValueError — When the two DataFrames don't have identical labels or shape.
See Also: Series.compare (Compare with another Series and show differences), DataFrame.equals (Test whether two objects contain the same elements).
Notes
Matching NaNs will not appear as a difference.
Can only compare identically-labeled (i.e. same shape, identical row and column labels) DataFrames.
>>> df = pd.DataFrame(
...     {
+... "col1": ["a", "a", "b", "b", "a"],
+... "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
+... "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
+... },
+... columns=["col1", "col2", "col3"],
+... )
+>>> df
+ col1 col2 col3
+0 a 1.0 1.0
+1 a 2.0 2.0
+2 b 3.0 3.0
+3 b NaN 4.0
+4 a 5.0 5.0
+
>>> df2 = df.copy()
+>>> df2.loc[0, 'col1'] = 'c'
+>>> df2.loc[2, 'col3'] = 4.0
+>>> df2
+ col1 col2 col3
+0 c 1.0 1.0
+1 a 2.0 2.0
+2 b 3.0 4.0
+3 b NaN 4.0
+4 a 5.0 5.0
+
Align the differences on columns
+>>> df.compare(df2)
+ col1 col3
+ self other self other
+0 a c NaN NaN
+2 NaN NaN 3.0 4.0
+
Assign result_names
+>>> df.compare(df2, result_names=("left", "right"))
+ col1 col3
+ left right left right
+0 a c NaN NaN
+2 NaN NaN 3.0 4.0
+
Stack the differences on rows
+>>> df.compare(df2, align_axis=0)
+ col1 col3
+0 self a NaN
+ other c NaN
+2 self NaN 3.0
+ other NaN 4.0
+
Keep the equal values
+>>> df.compare(df2, keep_equal=True)
+ col1 col3
+ self other self other
+0 a c 1.0 1.0
+2 b b 3.0 4.0
+
Keep all original rows and columns
+>>> df.compare(df2, keep_shape=True)
+ col1 col2 col3
+ self other self other self other
+0 a c NaN NaN NaN NaN
+1 NaN NaN NaN NaN NaN NaN
+2 NaN NaN NaN NaN 3.0 4.0
+3 NaN NaN NaN NaN NaN NaN
+4 NaN NaN NaN NaN NaN NaN
+
Keep all original rows and columns and also all original values
+>>> df.compare(df2, keep_shape=True, keep_equal=True)
+ col1 col2 col3
+ self other self other self other
+0 a c 1.0 1.0 1.0 1.0
+1 a a 2.0 2.0 2.0 2.0
+2 b b 3.0 3.0 3.0 4.0
+3 b b NaN NaN 4.0 4.0
+4 a a 5.0 5.0 5.0 5.0
+
combine(other, func, fill_value=None, overwrite=True)
Perform column-wise combine with another DataFrame.
Combines a DataFrame with other DataFrame using func to element-wise combine columns. The row and column indexes of the resulting DataFrame will be the union of the two.
other (DataFrame) — The DataFrame to merge column-wise.
func (function) — Function that takes two series as inputs and returns a Series or a scalar. Used to merge the two dataframes column by column.
fill_value (scalar value, default None) — The value to fill NaNs with prior to passing any column to the merge func.
overwrite (bool, default True) — If True, columns in self that do not exist in other will be overwritten with NaNs.
Returns: Combination of the provided DataFrames.
See Also: DataFrame.combine_first (Combine two DataFrame objects and default to non-null values in frame calling the method).
Combine using a simple function that chooses the smaller column.
>>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
+>>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+>>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
+>>> df1.combine(df2, take_smaller)
+ A B
+0 0 3
+1 0 3
+
Example using a true element-wise combine function.
+>>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
+>>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+>>> df1.combine(df2, np.minimum)
+ A B
+0 1 2
+1 0 3
+
Using fill_value
fills Nones prior to passing the column to the
+merge function.
>>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
+>>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+>>> df1.combine(df2, take_smaller, fill_value=-5)
+ A B
+0 0 -5.0
+1 0 4.0
+
However, if the same element in both dataframes is None, that None +is preserved
+>>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
+>>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
+>>> df1.combine(df2, take_smaller, fill_value=-5)
+ A B
+0 0 -5.0
+1 0 3.0
+
Example that demonstrates the use of overwrite
and behavior when
+the axis differ between the dataframes.
>>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
+>>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
+>>> df1.combine(df2, take_smaller)
+ A B C
+0 NaN NaN NaN
+1 NaN 3.0 -10.0
+2 NaN 3.0 1.0
+
>>> df1.combine(df2, take_smaller, overwrite=False)
+ A B C
+0 0.0 NaN NaN
+1 0.0 3.0 -10.0
+2 NaN 3.0 1.0
+
Demonstrating the preference of the passed in dataframe.
+>>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
+>>> df2.combine(df1, take_smaller)
+ A B C
+0 0.0 NaN NaN
+1 0.0 3.0 NaN
+2 NaN 3.0 NaN
+
>>> df2.combine(df1, take_smaller, overwrite=False)
+ A B C
+0 0.0 NaN NaN
+1 0.0 3.0 1.0
+2 NaN 3.0 1.0
+
combine_first(other)
Update null elements with value in the same location in other.
Combine two DataFrame objects by filling null values in one DataFrame with non-null values from the other DataFrame. The row and column indexes of the resulting DataFrame will be the union of the two. The resulting dataframe contains the 'first' dataframe values and overrides the second one values where both first.loc[index, col] and second.loc[index, col] are not missing values, upon calling first.combine_first(second).
other (DataFrame) — Provided DataFrame to use to fill null values.
Returns: The result of combining the provided DataFrame with the other object.
See Also: DataFrame.combine (Perform series-wise operation on two DataFrames using a given function).
>>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
>>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+>>> df1.combine_first(df2)
+ A B
+0 1.0 3.0
+1 0.0 4.0
+
Null values still persist if the location of that null value
+does not exist in other
>>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
+>>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
+>>> df1.combine_first(df2)
+ A B C
+0 NaN 4.0 NaN
+1 0.0 3.0 1.0
+2 NaN 3.0 1.0
+
update(other, join='left', overwrite=True, filter_func=None, errors='ignore')
Modify in place using non-NA values from another DataFrame.
Aligns on indices. There is no return value.
other (DataFrame, or object coercible into a DataFrame) — Should have at least one matching index/column label with the original DataFrame. If a Series is passed, its name attribute must be set, and that will be used as the column name to align with the original DataFrame.
join ({'left'}, default 'left') — Only left join is implemented, keeping the index and columns of the original object.
overwrite (bool, default True) — How to handle non-NA values for overlapping keys: if True, overwrite original DataFrame's values with values from other; if False, only update values that are NA in the original DataFrame.
filter_func (callable(1d-array) -> bool 1d-array, optional) — Can choose to replace values other than NA. Return True for values that should be updated.
errors ({'raise', 'ignore'}, default 'ignore') — If 'raise', will raise a ValueError if the DataFrame and other both contain non-NA data in the same place.
Returns: None. This method directly changes the calling object.
Raises: NotImplementedError — If join != 'left'. ValueError — When errors='raise' and there's overlapping non-NA data, or when errors is not either 'ignore' or 'raise'.
See Also: dict.update (Similar method for dictionaries), DataFrame.merge (For column(s)-on-column(s) operations).
>>> df = pd.DataFrame({'A': [1, 2, 3],
...                    'B': [400, 500, 600]})
+>>> new_df = pd.DataFrame({'B': [4, 5, 6],
+... 'C': [7, 8, 9]})
+>>> df.update(new_df)
+>>> df
+ A B
+0 1 4
+1 2 5
+2 3 6
+
The DataFrame's length does not increase as a result of the update, +only values at matching index/column labels are updated.
+>>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
+... 'B': ['x', 'y', 'z']})
+>>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
+>>> df.update(new_df)
+>>> df
+ A B
+0 a d
+1 b e
+2 c f
+
>>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
+... 'B': ['x', 'y', 'z']})
+>>> new_df = pd.DataFrame({'B': ['d', 'f']}, index=[0, 2])
+>>> df.update(new_df)
+>>> df
+ A B
+0 a d
+1 b y
+2 c f
+
For Series, its name attribute must be set.
+>>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
+... 'B': ['x', 'y', 'z']})
+>>> new_column = pd.Series(['d', 'e', 'f'], name='B')
+>>> df.update(new_column)
+>>> df
+ A B
+0 a d
+1 b e
+2 c f
+
If other
contains NaNs the corresponding values are not updated
+in the original dataframe.
>>> df = pd.DataFrame({'A': [1, 2, 3],
+... 'B': [400., 500., 600.]})
+>>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
+>>> df.update(new_df)
+>>> df
+ A B
+0 1 4.0
+1 2 500.0
+2 3 6.0
+
groupby(by=None, axis=<no_default>, level=None, as_index=True, sort=True, group_keys=True, observed=<no_default>, dropna=True)
Group DataFrame using a mapper or by a Series of columns.
A groupby operation involves some combination of splitting the +object, applying a function, and combining the results. This can be +used to group large amounts of data and compute operations on these +groups.
by (mapping, function, label, pd.Grouper or list of such) — Used to determine the groups for the groupby. If by is a function, it's called on each value of the object's index. If a dict or Series is passed, the Series or dict VALUES will be used to determine the groups (the Series' values are first aligned; see .align() method). If a list or ndarray of length equal to the selected axis is passed (see the groupby user guide: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#splitting-an-object-into-groups), the values are used as-is to determine the groups. A label or list of labels may be passed to group by the columns in self. Notice that a tuple is interpreted as a (single) key.
axis ({0 or 'index', 1 or 'columns'}, default 0) — Split along rows (0) or columns (1). For Series this parameter is unused and defaults to 0. Will be removed and behave like axis=0 in a future version; for axis=1, do frame.T.groupby(...) instead.
level (int, level name, or sequence of such, default None) — If the axis is a MultiIndex (hierarchical), group by a particular level or levels. Do not specify both by and level.
as_index (bool, default True) — Return object with group labels as the index. Only relevant for DataFrame input. as_index=False is effectively "SQL-style" grouped output. This argument has no effect on filtrations (see the filtrations in the user guide: https://pandas.pydata.org/docs/dev/user_guide/groupby.html#filtration), such as head(), tail(), nth() and in transformations (see the transformations in the user guide: https://pandas.pydata.org/docs/dev/user_guide/groupby.html#transformation).
sort (bool, default True) — Sort group keys. Get better performance by turning this off. Note this does not influence the order of observations within each group. Groupby preserves the order of rows within each group. If False, the groups will appear in the same order as they did in the original DataFrame. This argument has no effect on filtrations and transformations (see the user guide links above). Specifying sort=False with an ordered categorical grouper will no longer sort the values.
group_keys (bool, default True) — When calling apply and the by argument produces a like-indexed (i.e. a transform) result, add group keys to index to identify pieces. By default group keys are not included when the result's index (and column) labels match the inputs, and are included otherwise. group_keys will no longer be ignored when the result from apply is a like-indexed Series or DataFrame; specify group_keys explicitly to include the group keys or not. group_keys now defaults to True.
observed (bool, default False) — This only applies if any of the groupers are Categoricals. If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. The default value will change to True in a future version of pandas.
dropna (bool, default True) — If True, and if group keys contain NA values, NA values together with row/column will be dropped. If False, NA values will also be treated as the key in groups.
Returns a groupby object that contains information about the groups.
See Also: resample (Convenience method for frequency conversion and resampling of time series).
Notes
See the user guide (https://pandas.pydata.org/pandas-docs/stable/groupby.html) for more detailed usage and examples, including splitting an object into groups, iterating through groups, selecting a group, aggregation, and more.
>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
...                               'Parrot', 'Parrot'],
+... 'Max Speed': [380., 370., 24., 26.]})
+>>> df
+ Animal Max Speed
+0 Falcon 380.0
+1 Falcon 370.0
+2 Parrot 24.0
+3 Parrot 26.0
+>>> df.groupby(['Animal']).mean()
+ Max Speed
+Animal
+Falcon 375.0
+Parrot 25.0
+
Hierarchical Indexes
+We can groupby different levels of a hierarchical index
+using the level
parameter:
>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
+... ['Captive', 'Wild', 'Captive', 'Wild']]
+>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
+>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
+... index=index)
+>>> df
+ Max Speed
+Animal Type
+Falcon Captive 390.0
+ Wild 350.0
+Parrot Captive 30.0
+ Wild 20.0
+>>> df.groupby(level=0).mean()
+ Max Speed
+Animal
+Falcon 370.0
+Parrot 25.0
+>>> df.groupby(level="Type").mean()
+ Max Speed
+Type
+Captive 210.0
+Wild 185.0
+
We can also choose to include NA in group keys or not by setting the dropna parameter; the default setting is True.
>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
+>>> df = pd.DataFrame(l, columns=["a", "b", "c"])
+
>>> df.groupby(by=["b"]).sum()
+ a c
+b
+1.0 2 3
+2.0 2 5
+
>>> df.groupby(by=["b"], dropna=False).sum()
+ a c
+b
+1.0 2 3
+2.0 2 5
+NaN 1 4
+
>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
+>>> df = pd.DataFrame(l, columns=["a", "b", "c"])
+
>>> df.groupby(by="a").sum()
+ b c
+a
+a 13.0 13.0
+b 12.3 123.0
+
>>> df.groupby(by="a", dropna=False).sum()
+ b c
+a
+a 13.0 13.0
+b 12.3 123.0
+NaN 12.3 33.0
+
When using .apply()
, use group_keys
to include or exclude the
+group keys. The group_keys
argument defaults to True
(include).
>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
+... 'Parrot', 'Parrot'],
+... 'Max Speed': [380., 370., 24., 26.]})
+>>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x)
+ Max Speed
+Animal
+Falcon 0 380.0
+ 1 370.0
+Parrot 2 24.0
+ 3 26.0
+
>>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x)
+ Max Speed
+0 380.0
+1 370.0
+2 24.0
+3 26.0
+
pivot
(
columns
, index=<no_default>
, values=<no_default>
)
Return reshaped DataFrame organized by given index / column values.
Reshape data (produce a "pivot" table) based on column values. Uses
+unique values from specified index
/ columns
to form axes of the
+resulting DataFrame. This function does not support data
+aggregation, multiple values will result in a MultiIndex in the
+columns. See the User Guide <reshaping>
for more on reshaping.
columns
+(str or object or a list of str)
+— Column to use to make new frame's columns.index
+(str or object or a list of str, optional)
+— Column to use to make new frame's index. If not given, uses existing index.values
+(str, object or a list of the previous, optional)
+— Column(s) to use for populating new frame's values. If not specified, all remaining columns will be used and the result will
+have hierarchically indexed columns.
+Returns reshaped DataFrame.
ValueError
+
+— When there are any index
, columns
combinations with multiple values. Use DataFrame.pivot_table
when you need to aggregate.
+DataFrame.pivot_table : Generalization of pivot that can handle duplicate values for one index/column pair. +DataFrame.unstack : Pivot based on the index values instead of a + column. +wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt.
+Notes
+For finer-tuned control, see hierarchical indexing documentation along +with the related stack/unstack methods.
+Reference the user guide <reshaping.pivot>
for more examples.
>>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',... 'two'],
+... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+... 'baz': [1, 2, 3, 4, 5, 6],
+... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
+>>> df
+ foo bar baz zoo
+0 one A 1 x
+1 one B 2 y
+2 one C 3 z
+3 two A 4 q
+4 two B 5 w
+5 two C 6 t
+
>>> df.pivot(index='foo', columns='bar', values='baz')
+bar A B C
+foo
+one 1 2 3
+two 4 5 6
+
>>> df.pivot(index='foo', columns='bar')['baz']
+bar A B C
+foo
+one 1 2 3
+two 4 5 6
+
>>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
+ baz zoo
+bar A B C A B C
+foo
+one 1 2 3 x y z
+two 4 5 6 q w t
+
You could also assign a list of column names or a list of index names.
+>>> df = pd.DataFrame({
+... "lev1": [1, 1, 1, 2, 2, 2],
+... "lev2": [1, 1, 2, 1, 1, 2],
+... "lev3": [1, 2, 1, 2, 1, 2],
+... "lev4": [1, 2, 3, 4, 5, 6],
+... "values": [0, 1, 2, 3, 4, 5]})
+>>> df
+ lev1 lev2 lev3 lev4 values
+0 1 1 1 1 0
+1 1 1 2 2 1
+2 1 2 1 3 2
+3 2 1 2 4 3
+4 2 1 1 5 4
+5 2 2 2 6 5
+
>>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")
+lev2 1 2
+lev3 1 2 1 2
+lev1
+1 0.0 1.0 2.0 NaN
+2 4.0 3.0 NaN 5.0
+
>>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")
+ lev3 1 2
+lev1 lev2
+ 1 1 0.0 1.0
+ 2 2.0 NaN
+ 2 1 4.0 3.0
+ 2 NaN 5.0
+
A ValueError is raised if there are any duplicates.
+>>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
+... "bar": ['A', 'A', 'B', 'C'],
+... "baz": [1, 2, 3, 4]})
+>>> df
+ foo bar baz
+0 one A 1
+1 one A 2
+2 two B 3
+3 two C 4
+
Notice that the first two rows are the same for our index
+and columns
arguments.
>>> df.pivot(index='foo', columns='bar', values='baz')
+Traceback (most recent call last):
+ ...
+ValueError: Index contains duplicate entries, cannot reshape
+
pivot_table
(
values=None
, index=None
, columns=None
, aggfunc='mean'
, fill_value=None
, margins=False
, dropna=True
, margins_name='All'
, observed=<no_default>
, sort=True
)
Create a spreadsheet-style pivot table as a DataFrame.
The levels in the pivot table will be stored in MultiIndex objects +(hierarchical indexes) on the index and columns of the result DataFrame.
+values
+(list-like or scalar, optional)
+— Column or columns to aggregate.index
+(column, Grouper, array, or list of the previous)
+— Keys to group by on the pivot table index. If a list is passed, it can contain any of the other types (except list). If an array is
+passed, it must be the same length as the data and will be used in
+the same manner as column values.
+columns
+(column, Grouper, array, or list of the previous)
+— Keys to group by on the pivot table column. If a list is passed, it can contain any of the other types (except list). If an array is
+passed, it must be the same length as the data and will be used in
+the same manner as column values.
+aggfunc
+(function, list of functions, dict, default "mean")
+— If a list of functions is passed, the resulting pivot table will have hierarchical columns whose top level are the function names
+(inferred from the function objects themselves).
+If a dict is passed, the key is column to aggregate and the value is
+function or list of functions. If margins=True
, aggfunc will be
+used to calculate the partial aggregates.
+fill_value
+(scalar, default None)
+— Value to replace missing values with (in the resulting pivot table, after aggregation).
+margins
+(bool, default False)
+— If margins=True
, special All
columns and rows will be added with partial group aggregates across the categories
+on the rows and columns.
+dropna
+(bool, default True)
+— Do not include columns whose entries are all NaN. If True, rows with a NaN value in any column will be omitted before
+computing margins.
+margins_name
+(str, default 'All')
+— Name of the row / column that will contain the totalswhen margins is True.
+observed
+(bool, default False)
+— This only applies if any of the groupers are Categoricals.
+If True: only show observed values for categorical groupers.
+If False: show all values for categorical groupers.
+The default value of ``False`` is deprecated and will change to
+``True`` in a future version of pandas.
+
+sort
+(bool, default True)
+— Specifies if the result should be sorted.
+Returns a DataFrame: an Excel style pivot table.
DataFrame.pivot : Pivot without aggregation that can handle non-numeric data. +DataFrame.melt: Unpivot a DataFrame from wide to long format, + optionally leaving identifiers set. +wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt.
+Notes
+Reference the user guide <reshaping.pivot>
for more examples.
>>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",... "bar", "bar", "bar", "bar"],
+... "B": ["one", "one", "one", "two", "two",
+... "one", "one", "two", "two"],
+... "C": ["small", "large", "large", "small",
+... "small", "large", "small", "small",
+... "large"],
+... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
+>>> df
+ A B C D E
+0 foo one small 1 2
+1 foo one large 2 4
+2 foo one large 2 5
+3 foo two small 3 5
+4 foo two small 3 6
+5 bar one large 4 6
+6 bar one small 5 8
+7 bar two small 6 9
+8 bar two large 7 9
+
This first example aggregates values by taking the sum.
+>>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
+... columns=['C'], aggfunc="sum")
+>>> table
+C large small
+A B
+bar one 4.0 5.0
+ two 7.0 6.0
+foo one 4.0 1.0
+ two NaN 6.0
+
We can also fill missing values using the fill_value
parameter.
>>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
+... columns=['C'], aggfunc="sum", fill_value=0)
+>>> table
+C large small
+A B
+bar one 4 5
+ two 7 6
+foo one 4 1
+ two 0 6
+
The next example aggregates by taking the mean across multiple columns.
+>>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
+... aggfunc={'D': "mean", 'E': "mean"})
+>>> table
+ D E
+A C
+bar large 5.500000 7.500000
+ small 5.500000 8.500000
+foo large 2.000000 4.500000
+ small 2.333333 4.333333
+
We can also calculate multiple types of aggregations for any given +value column.
+>>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
+... aggfunc={'D': "mean",
+... 'E': ["min", "max", "mean"]})
+>>> table
+ D E
+ mean max mean min
+A C
+bar large 5.500000 9 7.500000 6
+ small 5.500000 9 8.500000 8
+foo large 2.000000 5 4.500000 4
+ small 2.333333 6 4.333333 2
+
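+The margins option can be illustrated with the same frame; this is a small sketch
+(not part of the original examples): with margins=True an extra "All" row and column
+holding the partial aggregates is appended, and the bottom-right "All" cell is the
+grand total of column D (33 for the frame above).
+>>> pd.pivot_table(df, values='D', index=['A'], columns=['C'],
+...                aggfunc="sum", margins=True)
+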
stack
(
level=-1
, dropna=<no_default>
, sort=<no_default>
, future_stack=False
)
Stack the prescribed level(s) from columns to index.
Return a reshaped DataFrame or Series having a multi-level +index with one or more new inner-most levels compared to the current +DataFrame. The new inner-most levels are created by pivoting the +columns of the current dataframe:
+level
+(int, str, list, default -1)
+— Level(s) to stack from the column axis onto the index axis, defined as one index or label, or a list of indices
+or labels.
+dropna
+(bool, default True)
+— Whether to drop rows in the resulting Frame/Series with missing values. Stacking a column level onto the index
+axis can create combinations of index and column values
+that are missing from the original dataframe. See Examples
+section.
+sort
+(bool, default True)
+— Whether to sort the levels of the resulting MultiIndex.future_stack
+(bool, default False)
+— Whether to use the new implementation that will replace the current implementation in pandas 3.0. When True, dropna and sort have no impact
+on the result and must remain unspecified. See the pandas 2.1.0 Release
+notes <whatsnew_210.enhancements.new_stack>
for more details.
+Stacked dataframe or series.
DataFrame.unstack : Unstack prescribed level(s) from index axis onto column axis. +DataFrame.pivot : Reshape dataframe from long format to wide + format. +DataFrame.pivot_table : Create a spreadsheet-style pivot table + as a DataFrame.
+Notes
+The function is named by analogy with a collection of books +being reorganized from being side by side on a horizontal +position (the columns of the dataframe) to being stacked +vertically on top of each other (in the index of the +dataframe).
+Reference the user guide <reshaping.stacking>
for more examples.
Single level columns
>>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
+... index=['cat', 'dog'],
+... columns=['weight', 'height'])
+
Stacking a dataframe with a single level column axis returns a Series:
+>>> df_single_level_cols
+ weight height
+cat 0 1
+dog 2 3
+>>> df_single_level_cols.stack(future_stack=True)
+cat weight 0
+ height 1
+dog weight 2
+ height 3
+dtype: int64
+
Multi level columns: simple case
+>>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
+... ('weight', 'pounds')])
+>>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
+... index=['cat', 'dog'],
+... columns=multicol1)
+
Stacking a dataframe with a multi-level column axis:
+>>> df_multi_level_cols1
+ weight
+ kg pounds
+cat 1 2
+dog 2 4
+>>> df_multi_level_cols1.stack(future_stack=True)
+ weight
+cat kg 1
+ pounds 2
+dog kg 2
+ pounds 4
+
Missing values
+>>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
+... ('height', 'm')])
+>>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
+... index=['cat', 'dog'],
+... columns=multicol2)
+
It is common to have missing values when stacking a dataframe +with multi-level columns, as the stacked dataframe typically +has more values than the original dataframe. Missing values +are filled with NaNs:
+>>> df_multi_level_cols2
+ weight height
+ kg m
+cat 1.0 2.0
+dog 3.0 4.0
+>>> df_multi_level_cols2.stack(future_stack=True)
+ weight height
+cat kg 1.0 NaN
+ m NaN 2.0
+dog kg 3.0 NaN
+ m NaN 4.0
+
Prescribing the level(s) to be stacked
+The first parameter controls which level or levels are stacked:
+>>> df_multi_level_cols2.stack(0, future_stack=True)
+ kg m
+cat weight 1.0 NaN
+ height NaN 2.0
+dog weight 3.0 NaN
+ height NaN 4.0
+>>> df_multi_level_cols2.stack([0, 1], future_stack=True)
+cat weight kg 1.0
+ height m 2.0
+dog weight kg 3.0
+ height m 4.0
+dtype: float64
+
explode
(
column
, ignore_index=False
)
Transform each element of a list-like to a row, replicating index values.
column
+(IndexLabel)
+— Column(s) to explode. For multiple columns, specify a non-empty list in which each element
+is a str or tuple, and the list-like data in all specified columns
+must have matching lengths on the same row of the frame.ignore_index
+(bool, default False)
+— If True, the resulting index will be labeled 0, 1, …, n - 1.
+Returns the exploded lists as rows of the subset columns; the index will be duplicated for these rows.
+ValueError
+
+— Raised if the columns of the frame are not unique, if the specified columns to explode are an empty list, or if the specified columns to explode have non-matching counts of elements rowwise in the frame.
+DataFrame.unstack : Pivot a level of the (necessarily hierarchical) index labels. +DataFrame.melt : Unpivot a DataFrame from wide format to long format. +Series.explode : Explode a DataFrame from list-like columns to long format.
+Notes
+This routine will explode list-likes including lists, tuples, sets, +Series, and np.ndarray. The result dtype of the subset rows will +be object. Scalars will be returned unchanged, and empty list-likes will +result in a np.nan for that row. In addition, the ordering of rows in the +output will be non-deterministic when exploding sets.
+Reference the user guide <reshaping.explode>
for more examples.
>>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],... 'B': 1,
+... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
+>>> df
+ A B C
+0 [0, 1, 2] 1 [a, b, c]
+1 foo 1 NaN
+2 [] 1 []
+3 [3, 4] 1 [d, e]
+
Single-column explode.
+>>> df.explode('A')
+ A B C
+0 0 1 [a, b, c]
+0 1 1 [a, b, c]
+0 2 1 [a, b, c]
+1 foo 1 NaN
+2 NaN 1 []
+3 3 1 [d, e]
+3 4 1 [d, e]
+
Multi-column explode.
+>>> df.explode(list('AC'))
+ A B C
+0 0 1 a
+0 1 1 b
+0 2 1 c
+1 foo 1 NaN
+2 NaN 1 NaN
+3 3 1 d
+3 4 1 e
+
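+A small additional sketch (not part of the original examples): passing ignore_index=True
+relabels the resulting index 0, 1, …, n - 1 instead of repeating the original labels.
+>>> df.explode('A', ignore_index=True)
+     A  B          C
+0    0  1  [a, b, c]
+1    1  1  [a, b, c]
+2    2  1  [a, b, c]
+3  foo  1        NaN
+4  NaN  1         []
+5    3  1     [d, e]
+6    4  1     [d, e]
+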
unstack
(
level=-1
, fill_value=None
, sort=True
)
Pivot a level of the (necessarily hierarchical) index labels.
Returns a DataFrame having a new level of column labels whose inner-most level +consists of the pivoted index labels.
+If the index is not a MultiIndex, the output will be a Series +(the analogue of stack when the columns are not a MultiIndex).
+level
+(int, str, or list of these, default -1 (last level))
+— Level(s) of index to unstack, can pass level name.fill_value
+(int, str or dict)
+— Replace NaN with this value if the unstack produces missing values.sort
+(bool, default True)
+— Sort the level(s) in the resulting MultiIndex columns.DataFrame.pivot : Pivot a table based on column values.DataFrame.stack : Pivot a level of the column labels (inverse operation
+ from unstack
).
Notes
+Reference the user guide <reshaping.stacking>
for more examples.
>>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),... ('two', 'a'), ('two', 'b')])
+>>> s = pd.Series(np.arange(1.0, 5.0), index=index)
+>>> s
+one a 1.0
+ b 2.0
+two a 3.0
+ b 4.0
+dtype: float64
+
>>> s.unstack(level=-1)
+ a b
+one 1.0 2.0
+two 3.0 4.0
+
>>> s.unstack(level=0)
+ one two
+a 1.0 3.0
+b 2.0 4.0
+
>>> df = s.unstack(level=0)
+>>> df.unstack()
+one a 1.0
+ b 2.0
+two a 3.0
+ b 4.0
+dtype: float64
+
melt
(
id_vars=None
, value_vars=None
, var_name=None
, value_name='value'
, col_level=None
, ignore_index=True
)
Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
This function is useful to massage a DataFrame into a format where one
+or more columns are identifier variables (id_vars
), while all other
+columns, considered measured variables (value_vars
), are "unpivoted" to
+the row axis, leaving just two non-identifier columns, 'variable' and
+'value'.
id_vars
+(scalar, tuple, list, or ndarray, optional)
+— Column(s) to use as identifier variables.value_vars
+(scalar, tuple, list, or ndarray, optional)
+— Column(s) to unpivot. If not specified, uses all columns that are not set as id_vars
.
+var_name
+(scalar, default None)
+— Name to use for the 'variable' column. If None it uses frame.columns.name
or 'variable'.
+value_name
+(scalar, default 'value')
+— Name to use for the 'value' column, can't be an existing column label.col_level
+(scalar, optional)
+— If columns are a MultiIndex then use this level to melt.ignore_index
+(bool, default True)
+— If True, original index is ignored. If False, the original index is retained.Index labels will be repeated as necessary.
+Unpivoted DataFrame.
melt : Identical method.pivot_table : Create a spreadsheet-style pivot table as a DataFrame. +DataFrame.pivot : Return reshaped DataFrame organized + by given index / column values. +DataFrame.explode : Explode a DataFrame from list-like + columns to long format.
+Notes
+Reference the user guide <reshaping.melt>
for more examples.
>>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},... 'B': {0: 1, 1: 3, 2: 5},
+... 'C': {0: 2, 1: 4, 2: 6}})
+>>> df
+ A B C
+0 a 1 2
+1 b 3 4
+2 c 5 6
+
>>> df.melt(id_vars=['A'], value_vars=['B'])
+ A variable value
+0 a B 1
+1 b B 3
+2 c B 5
+
>>> df.melt(id_vars=['A'], value_vars=['B', 'C'])
+ A variable value
+0 a B 1
+1 b B 3
+2 c B 5
+3 a C 2
+4 b C 4
+5 c C 6
+
The names of 'variable' and 'value' columns can be customized:
+>>> df.melt(id_vars=['A'], value_vars=['B'],
+... var_name='myVarname', value_name='myValname')
+ A myVarname myValname
+0 a B 1
+1 b B 3
+2 c B 5
+
Original index values can be kept around:
+>>> df.melt(id_vars=['A'], value_vars=['B', 'C'], ignore_index=False)
+ A variable value
+0 a B 1
+1 b B 3
+2 c B 5
+0 a C 2
+1 b C 4
+2 c C 6
+
If you have multi-index columns:
+>>> df.columns = [list('ABC'), list('DEF')]
+>>> df
+ A B C
+ D E F
+0 a 1 2
+1 b 3 4
+2 c 5 6
+
>>> df.melt(col_level=0, id_vars=['A'], value_vars=['B'])
+ A variable value
+0 a B 1
+1 b B 3
+2 c B 5
+
>>> df.melt(id_vars=[('A', 'D')], value_vars=[('B', 'E')])
+ (A, D) variable_0 variable_1 value
+0 a B E 1
+1 b B E 3
+2 c B E 5
+
diff
(
periods=1
, axis=0
)
First discrete difference of element.
Calculates the difference of a DataFrame element compared with another +element in the DataFrame (default is element in previous row).
+periods
+(int, default 1)
+— Periods to shift for calculating difference, accepts negative values.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— Take difference over rows (0) or columns (1).
+Returns the first differences of the Series.
DataFrame.pct_change: Percent change over given number of periods.DataFrame.shift: Shift index by desired number of periods with an + optional time freq. +Series.diff: First discrete difference of object.
+Notes
+For boolean dtypes, this uses operator.xor rather than operator.sub.
+The result is calculated according to current dtype in DataFrame,
+however dtype of the result is always float64.
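+A minimal usage sketch (the frame below is made up for illustration): by default each
+element is compared with the element in the previous row, while axis=1 or a different
+periods value changes the direction or distance of the comparison.
+>>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
+...                    'b': [1, 1, 2, 3, 5, 8],
+...                    'c': [1, 4, 9, 16, 25, 36]})
+>>> df.diff()
+     a    b     c
+0  NaN  NaN   NaN
+1  1.0  0.0   3.0
+2  1.0  1.0   5.0
+3  1.0  1.0   7.0
+4  1.0  2.0   9.0
+5  1.0  3.0  11.0
+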
+aggregate
(
func=None
, axis=0
, *args
, **kwargs
)
Aggregate using one or more operations over the specified axis.
func
+(function, str, list or dict)
+— Function to use for aggregating the data. If a function, it must either work when passed a DataFrame or when passed to DataFrame.apply. Accepted combinations are a function, a string function name, or a list or dict of them (e.g. [np.sum, 'mean']).
axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— If 0 or 'index': apply function to each column.
+If 1 or 'columns': apply function to each row.
+*args
+
+— Positional arguments to pass to func
.**kwargs
+
+— Keyword arguments to pass to func
.
+Returns a scalar, Series or DataFrame with the aggregated results.
+DataFrame.apply : Perform any type of operations.DataFrame.transform : Perform transformation type operations. +pandas.DataFrame.groupby : Perform operations over groups. +pandas.DataFrame.resample : Perform operations over resampled bins. +pandas.DataFrame.rolling : Perform operations over rolling window. +pandas.DataFrame.expanding : Perform operations over expanding window. +pandas.core.window.ewm.ExponentialMovingWindow : Perform operation over exponential + weighted window.
+Notes
+The aggregation operations are always performed over an axis, either the
+index (default) or the column axis. This behavior is different from
+numpy
aggregation functions (mean
, median
, prod
, sum
, std
,
+var
), where the default is to compute the aggregation of the flattened
+array, e.g., numpy.mean(arr_2d)
as opposed to
+numpy.mean(arr_2d, axis=0)
.
agg
is an alias for aggregate
. Use the alias.
Functions that mutate the passed object can produce unexpected
+behavior or errors and are not supported. See gotchas.udf-mutation
+for more details.
A passed user-defined-function will be passed a Series for evaluation.
+>>> df = pd.DataFrame([[1, 2, 3],... [4, 5, 6],
+... [7, 8, 9],
+... [np.nan, np.nan, np.nan]],
+... columns=['A', 'B', 'C'])
+
Aggregate these functions over the rows.
+>>> df.agg(['sum', 'min'])
+ A B C
+sum 12.0 15.0 18.0
+min 1.0 2.0 3.0
+
Different aggregations per column.
+>>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
+ A B
+sum 12.0 NaN
+min 1.0 2.0
+max NaN 8.0
+
Aggregate different functions over the columns and rename the index of the resulting +DataFrame.
+>>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean'))
+ A B C
+x 7.0 NaN NaN
+y NaN 2.0 NaN
+z NaN NaN 6.0
+
Aggregate over the columns.
+>>> df.agg("mean", axis="columns")
+0 2.0
+1 5.0
+2 8.0
+3 NaN
+dtype: float64
+
transform
(
func
, axis=0
, *args
, **kwargs
)
Call func
on self producing a DataFrame with the same axis shape as self.
func
+(function, str, list-like or dict-like)
+— Function to use for transforming the data. If a function, it must either work when passed a DataFrame or when passed to DataFrame.apply. If func
+is both list-like and dict-like, dict-like behavior takes precedence. Accepted combinations are a function, a string function name, or a list or dict of them (e.g. [np.exp, 'sqrt']).
axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— If 0 or 'index': apply function to each column.
+If 1 or 'columns': apply function to each row.
+*args
+
+— Positional arguments to pass to func
.**kwargs
+
+— Keyword arguments to pass to func
.A DataFrame that must have the same length as self.
DataFrame.agg : Only perform aggregating type operations.DataFrame.apply : Invoke function on a DataFrame.
+Notes
+Functions that mutate the passed object can produce unexpected
+behavior or errors and are not supported. See gotchas.udf-mutation
+for more details.
>>> df = pd.DataFrame({'A': range(3), 'B': range(1, 4)})
+>>> df
+ A B
+0 0 1
+1 1 2
+2 2 3
+>>> df.transform(lambda x: x + 1)
+ A B
+0 1 2
+1 2 3
+2 3 4
+
Even though the resulting DataFrame must have the same length as the +input DataFrame, it is possible to provide several input functions:
+>>> s = pd.Series(range(3))
+>>> s
+0 0
+1 1
+2 2
+dtype: int64
+>>> s.transform([np.sqrt, np.exp])
+ sqrt exp
+0 0.000000 1.000000
+1 1.000000 2.718282
+2 1.414214 7.389056
+
You can call transform on a GroupBy object:
+>>> df = pd.DataFrame({
+... "Date": [
+... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05",
+... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05"],
+... "Data": [5, 8, 6, 1, 50, 100, 60, 120],
+... })
+>>> df
+ Date Data
+0 2015-05-08 5
+1 2015-05-07 8
+2 2015-05-06 6
+3 2015-05-05 1
+4 2015-05-08 50
+5 2015-05-07 100
+6 2015-05-06 60
+7 2015-05-05 120
+>>> df.groupby('Date')['Data'].transform('sum')
+0 55
+1 108
+2 66
+3 121
+4 55
+5 108
+6 66
+7 121
+Name: Data, dtype: int64
+
>>> df = pd.DataFrame({
+... "c": [1, 1, 1, 2, 2, 2, 2],
+... "type": ["m", "n", "o", "m", "m", "n", "n"]
+... })
+>>> df
+ c type
+0 1 m
+1 1 n
+2 1 o
+3 2 m
+4 2 m
+5 2 n
+6 2 n
+>>> df['size'] = df.groupby('c')['type'].transform(len)
+>>> df
+ c type size
+0 1 m 3
+1 1 n 3
+2 1 o 3
+3 2 m 4
+4 2 m 4
+5 2 n 4
+6 2 n 4
+
apply
(
func
, axis=0
, raw=False
, result_type=None
, args=()
, by_row='compat'
, engine='python'
, engine_kwargs=None
, **kwargs
)
Apply a function along an axis of the DataFrame.
Objects passed to the function are Series objects whose index is
+either the DataFrame's index (axis=0
) or the DataFrame's columns
+(axis=1
). By default (result_type=None
), the final return type
+is inferred from the return type of the applied function. Otherwise,
+it depends on the result_type
argument.
func
+(function)
+— Function to apply to each column or row.axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— Axis along which the function is applied:raw
+(bool, default False)
+— Determines if row or column is passed as a Series or ndarray object:False
: passes each row or column as a Series to the
+ function.True
: the passed function will receive ndarray objects
+ instead.
+ If you are just applying a NumPy reduction function this will
+ achieve much better performance.result_type
+({'expand', 'reduce', 'broadcast', None}, default None)
+— These only act when axis=1
(columns):
+'expand' : list-like results will be turned into columns.
+'reduce' : returns a Series if possible rather than expanding list-like results (the opposite of 'expand').
+'broadcast' : results will be broadcast to the original shape of the DataFrame; the original index and columns will be retained.
+None : the default; depends on the return type of the applied function, with list-like results returned as a Series of those, unless the function returns a Series, in which case these are expanded to columns.
+args
+(tuple)
+— Positional arguments to pass to func
in addition to the array/series.
+by_row
+(False or "compat", default "compat")
+— Only has an effect when func
is a listlike or dictlike of funcsand the func isn't a string.
+If "compat", will if possible first translate the func into pandas
+methods (e.g. Series().apply(np.sum)
will be translated to
+Series().sum()
). If that doesn't work, will try call to apply again with
+by_row=True
and if that fails, will call apply again with
+by_row=False
(backward compatible).
+If False, the funcs will be passed the whole Series at once.engine
+({'python', 'numba'}, default 'python')
+— Choose between the python (default) engine or the numba engine in apply. See the supported python features
+<https://numba.pydata.org/numba-doc/dev/reference/pysupported.html>
+and supported numpy features
+<https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>
+in numba to learn what you can or cannot use in the passed function.engine_kwargs
+(dict)
+— Pass keyword arguments to the engine.This is currently only used by the numba engine,
+see the documentation for the engine argument for more information.
+**kwargs
+
+— Additional keyword arguments to pass as keyword arguments to func
.
+Result of applying func
along the given axis of the DataFrame.
DataFrame.map: For elementwise operations.DataFrame.aggregate: Only perform aggregating type operations. +DataFrame.transform: Only perform transforming type operations.
+Notes
+Functions that mutate the passed object can produce unexpected
+behavior or errors and are not supported. See gotchas.udf-mutation
+for more details.
>>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
+>>> df
+ A B
+0 4 9
+1 4 9
+2 4 9
+
Using a numpy universal function (in this case the same as
+np.sqrt(df)
):
>>> df.apply(np.sqrt)
+ A B
+0 2.0 3.0
+1 2.0 3.0
+2 2.0 3.0
+
Using a reducing function on either axis
+>>> df.apply(np.sum, axis=0)
+A 12
+B 27
+dtype: int64
+
>>> df.apply(np.sum, axis=1)
+0 13
+1 13
+2 13
+dtype: int64
+
Returning a list-like will result in a Series
+>>> df.apply(lambda x: [1, 2], axis=1)
+0 [1, 2]
+1 [1, 2]
+2 [1, 2]
+dtype: object
+
Passing result_type='expand'
will expand list-like results
+to columns of a Dataframe
>>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
+ 0 1
+0 1 2
+1 1 2
+2 1 2
+
Returning a Series inside the function is similar to passing
+result_type='expand'
. The resulting column names
+will be the Series index.
>>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
+ foo bar
+0 1 2
+1 1 2
+2 1 2
+
Passing result_type='broadcast'
will ensure the same shape
+result, whether list-like or scalar is returned by the function,
+and broadcast it along the axis. The resulting column names will
+be the originals.
>>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
+ A B
+0 1 2
+1 1 2
+2 1 2
+
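+A small additional sketch (not part of the original examples): extra positional
+arguments can be forwarded to func through the args tuple (and extra keyword
+arguments through **kwargs).
+>>> def add_n(col, n):
+...     return col + n
+>>> df.apply(add_n, args=(1,))
+   A   B
+0  5  10
+1  5  10
+2  5  10
+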
map
(
func
, na_action=None
, **kwargs
)
Apply a function to a Dataframe elementwise.
.. versionadded:: 2.1.0
+DataFrame.applymap was deprecated and renamed to DataFrame.map.
+This method applies a function that accepts and returns a scalar +to every element of a DataFrame.
+func
+(callable)
+— Python function, returns a single value from a single value.na_action
+({None, 'ignore'}, default None)
+— If 'ignore', propagate NaN values, without passing them to func.**kwargs
+
+— Additional keyword arguments to pass as keyword arguments to func
.
+Transformed DataFrame.
DataFrame.apply : Apply a function along input axis of DataFrame.DataFrame.replace: Replace values given in to_replace
with value
.
+Series.map : Apply a function elementwise on a Series.
>>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
+>>> df
+ 0 1
+0 1.000 2.120
+1 3.356 4.567
+
>>> df.map(lambda x: len(str(x)))
+ 0 1
+0 3 4
+1 5 5
+
Like Series.map, NA values can be ignored:
+>>> df_copy = df.copy()
+>>> df_copy.iloc[0, 0] = pd.NA
+>>> df_copy.map(lambda x: len(str(x)), na_action='ignore')
+ 0 1
+0 NaN 4
+1 5.0 5
+
It is also possible to use map
with functions that are not
+lambda
functions:
>>> df.map(round, ndigits=1)
+ 0 1
+0 1.0 2.1
+1 3.4 4.6
+
Note that a vectorized version of func
often exists, which will
+be much faster. You could square each number elementwise.
>>> df.map(lambda x: x**2)
+ 0 1
+0 1.000000 4.494400
+1 11.262736 20.857489
+
But it's better to avoid map in that case.
+>>> df ** 2
+ 0 1
+0 1.000000 4.494400
+1 11.262736 20.857489
+
applymap
(
func
, na_action=None
, **kwargs
)
Apply a function to a Dataframe elementwise.
.. deprecated:: 2.1.0
+DataFrame.applymap has been deprecated. Use DataFrame.map instead.
+This method applies a function that accepts and returns a scalar +to every element of a DataFrame.
+func
+(callable)
+— Python function, returns a single value from a single value.na_action
+({None, 'ignore'}, default None)
+— If 'ignore', propagate NaN values, without passing them to func.**kwargs
+
+— Additional keyword arguments to pass as keyword arguments to func
.
+Transformed DataFrame.
DataFrame.apply : Apply a function along input axis of DataFrame.DataFrame.map : Apply a function to a DataFrame elementwise.
+DataFrame.replace: Replace values given in to_replace
with value
.
>>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
+>>> df
+ 0 1
+0 1.000 2.120
+1 3.356 4.567
+
>>> df.map(lambda x: len(str(x)))
+ 0 1
+0 3 4
+1 5 5
+
join
(
other
, on=None
, how='left'
, lsuffix=''
, rsuffix=''
, sort=False
, validate=None
)
Join columns of another DataFrame.
Join columns with other
DataFrame either on index or on a key
+column. Efficiently join multiple DataFrame objects by index at once by
+passing a list.
other
+(DataFrame, Series, or a list containing any combination of them)
+— Index should be similar to one of the columns in this one. If a Series is passed, its name attribute must be set, and that will be
+used as the column name in the resulting joined DataFrame.
+on
+(str, list of str, or array-like, optional)
+— Column or index level name(s) in the caller to join on the index in other
, otherwise joins index-on-index. If multiple
+values given, the other
DataFrame must have a MultiIndex. Can
+pass an array as the join key if it is not already contained in
+the calling DataFrame. Like an Excel VLOOKUP operation.
+how
+({'left', 'right', 'outer', 'inner', 'cross'}, default 'left')
+— How to handle the operation of the two objects.
+left: use the calling frame's index (or column if on is specified).
+right: use other's index.
+outer: form the union of the calling frame's index (or column if on is specified) with other's index, and sort it lexicographically.
+inner: form the intersection of the calling frame's index (or column if on is specified) with other's index, preserving the order of the calling's one.
+cross: creates the cartesian product from both frames, preserving the order of the left keys.
+lsuffix
+(str, default '')
+— Suffix to use from left frame's overlapping columns.rsuffix
+(str, default '')
+— Suffix to use from right frame's overlapping columns.sort
+(bool, default False)
+— Order result DataFrame lexicographically by the join key. If False, the order of the join key depends on the join type (how keyword).
+validate
+(str, optional)
+— If specified, checks if join is of specified type.
+Returns a DataFrame containing columns from both the caller and other
.
DataFrame.merge : For column(s)-on-column(s) operations.
Notes
+Parameters on
, lsuffix
, and rsuffix
are not supported when
+passing a list of DataFrame
objects.
>>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
+
>>> df
+ key A
+0 K0 A0
+1 K1 A1
+2 K2 A2
+3 K3 A3
+4 K4 A4
+5 K5 A5
+
>>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
+... 'B': ['B0', 'B1', 'B2']})
+
>>> other
+ key B
+0 K0 B0
+1 K1 B1
+2 K2 B2
+
Join DataFrames using their indexes.
+>>> df.join(other, lsuffix='_caller', rsuffix='_other')
+ key_caller A key_other B
+0 K0 A0 K0 B0
+1 K1 A1 K1 B1
+2 K2 A2 K2 B2
+3 K3 A3 NaN NaN
+4 K4 A4 NaN NaN
+5 K5 A5 NaN NaN
+
If we want to join using the key columns, we need to set key to be
+the index in both df
and other
. The joined DataFrame will have
+key as its index.
>>> df.set_index('key').join(other.set_index('key'))
+ A B
+key
+K0 A0 B0
+K1 A1 B1
+K2 A2 B2
+K3 A3 NaN
+K4 A4 NaN
+K5 A5 NaN
+
Another option to join using the key columns is to use the on
+parameter. DataFrame.join always uses other
's index but we can use
+any column in df
. This method preserves the original DataFrame's
+index in the result.
>>> df.join(other.set_index('key'), on='key')
+ key A B
+0 K0 A0 B0
+1 K1 A1 B1
+2 K2 A2 B2
+3 K3 A3 NaN
+4 K4 A4 NaN
+5 K5 A5 NaN
+
Using non-unique key values shows how they are matched.
+>>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'],
+... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
+
>>> df
+ key A
+0 K0 A0
+1 K1 A1
+2 K1 A2
+3 K3 A3
+4 K0 A4
+5 K1 A5
+
>>> df.join(other.set_index('key'), on='key', validate='m:1')
+ key A B
+0 K0 A0 B0
+1 K1 A1 B1
+2 K1 A2 B1
+3 K3 A3 NaN
+4 K0 A4 B0
+5 K1 A5 B1
+
merge
(
right
, how='inner'
, on=None
, left_on=None
, right_on=None
, left_index=False
, right_index=False
, sort=False
, suffixes=('_x', '_y')
, copy=None
, indicator=False
, validate=None
)
Merge DataFrame or named Series objects with a database-style join.
A named Series object is treated as a DataFrame with a single named column.
+The join is done on columns or indexes. If joining columns on +columns, the DataFrame indexes will be ignored. Otherwise if joining indexes +on indexes or indexes on a column or columns, the index will be passed on. +When performing a cross merge, no column specifications to merge on are +allowed.
+.. warning::
+If both key columns contain rows where the key is a null value, those
+rows will be matched against each other. This is different from usual SQL
+join behaviour and can lead to unexpected results.
+
+right
+(DataFrame or named Series)
+— Object to merge with.how
+({'left', 'right', 'outer', 'inner', 'cross'}, default 'inner')
+— Type of merge to be performed.on
+(label or list)
+— Column or index level names to join on. These must be found in both DataFrames. If on
is None and not merging on indexes then this defaults
+to the intersection of the columns in both DataFrames.
+left_on
+(label or list, or array-like)
+— Column or index level names to join on in the left DataFrame. Can also be an array or list of arrays of the length of the left DataFrame.
+These arrays are treated as if they are columns.
+right_on
+(label or list, or array-like)
+— Column or index level names to join on in the right DataFrame. Can also be an array or list of arrays of the length of the right DataFrame.
+These arrays are treated as if they are columns.
+left_index
+(bool, default False)
+— Use the index from the left DataFrame as the join key(s). If it is a MultiIndex, the number of keys in the other DataFrame (either the index
+or a number of columns) must match the number of levels.
+right_index
+(bool, default False)
+— Use the index from the right DataFrame as the join key. Same caveats as left_index.
+sort
+(bool, default False)
+— Sort the join keys lexicographically in the result DataFrame. If False, the order of the join keys depends on the join type (how keyword).
+suffixes
+(list-like, default is ("_x", "_y"))
+— A length-2 sequence where each element is optionally a string indicating the suffix to add to overlapping column names in
+left
and right
respectively. Pass a value of None
instead
+of a string to indicate that the column name from left
or
+right
should be left as-is, with no suffix. At least one of the
+values must not be None.
+copy
+(bool, default True)
+— If False, avoid copy if possible.copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas.You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+indicator
+(bool or str, default False)
+— If True, adds a column to the output DataFrame called "_merge" with information on the source of each row. The column can be given a different
+name by providing a string argument. The column will have a Categorical
+type with the value of "left_only" for observations whose merge key only
+appears in the left DataFrame, "right_only" for observations
+whose merge key only appears in the right DataFrame, and "both"
+if the observation's merge key is found in both DataFrames.
+validate
+(str, optional)
+— If specified, checks if merge is of specified type.
+Returns a DataFrame of the two merged objects.
merge_ordered : Merge with optional filling/interpolation.merge_asof : Merge on nearest keys. +DataFrame.join : Similar method using indices.
+>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],... 'value': [1, 2, 3, 5]})
+>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
+... 'value': [5, 6, 7, 8]})
+>>> df1
+ lkey value
+0 foo 1
+1 bar 2
+2 baz 3
+3 foo 5
+>>> df2
+ rkey value
+0 foo 5
+1 bar 6
+2 baz 7
+3 foo 8
+
Merge df1 and df2 on the lkey and rkey columns. The value columns have +the default suffixes, _x and _y, appended.
+>>> df1.merge(df2, left_on='lkey', right_on='rkey')
+ lkey value_x rkey value_y
+0 foo 1 foo 5
+1 foo 1 foo 8
+2 bar 2 bar 6
+3 baz 3 baz 7
+4 foo 5 foo 5
+5 foo 5 foo 8
+
Merge DataFrames df1 and df2 with specified left and right suffixes +appended to any overlapping columns.
+>>> df1.merge(df2, left_on='lkey', right_on='rkey',
+... suffixes=('_left', '_right'))
+ lkey value_left rkey value_right
+0 foo 1 foo 5
+1 foo 1 foo 8
+2 bar 2 bar 6
+3 baz 3 baz 7
+4 foo 5 foo 5
+5 foo 5 foo 8
+
Merge DataFrames df1 and df2, but raise an exception if the DataFrames have +any overlapping columns.
+>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
+Traceback (most recent call last):
+...
+ValueError: columns overlap but no suffix specified:
+ Index(['value'], dtype='object')
+
>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
+>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
+>>> df1
+ a b
+0 foo 1
+1 bar 2
+>>> df2
+ a c
+0 foo 3
+1 baz 4
+
>>> df1.merge(df2, how='inner', on='a')
+ a b c
+0 foo 1 3
+
>>> df1.merge(df2, how='left', on='a')
+ a b c
+0 foo 1 3.0
+1 bar 2 NaN
+
>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
+>>> df2 = pd.DataFrame({'right': [7, 8]})
+>>> df1
+ left
+0 foo
+1 bar
+>>> df2
+ right
+0 7
+1 8
+
>>> df1.merge(df2, how='cross')
+ left right
+0 foo 7
+1 foo 8
+2 bar 7
+3 bar 8
+
round
(
decimals=0
, *args
, **kwargs
)
Round a DataFrame to a variable number of decimal places.
decimals
+(int, dict, Series)
+— Number of decimal places to round each column to. If an int is given, round each column to the same number of places.
+Otherwise dict and Series round to variable numbers of places.
+Column names should be in the keys if decimals
is a
+dict-like, or in the index if decimals
is a Series. Any
+columns not included in decimals
will be left as is. Elements
+of decimals
which are not columns of the input will be
+ignored.
+*args
+
+— Additional keywords have no effect but might be accepted for compatibility with numpy.
+**kwargs
+
+— Additional keywords have no effect but might be accepted for compatibility with numpy.
+A DataFrame with the affected columns rounded to the specified number of decimal places.
+numpy.around : Round a numpy array to the given number of decimals.Series.round : Round a Series to the given number of decimals.
+>>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],... columns=['dogs', 'cats'])
+>>> df
+ dogs cats
+0 0.21 0.32
+1 0.01 0.67
+2 0.66 0.03
+3 0.21 0.18
+
By providing an integer each column is rounded to the same number +of decimal places
+>>> df.round(1)
+ dogs cats
+0 0.2 0.3
+1 0.0 0.7
+2 0.7 0.0
+3 0.2 0.2
+
With a dict, the number of places for specific columns can be +specified with the column names as key and the number of decimal +places as value
+>>> df.round({'dogs': 1, 'cats': 0})
+ dogs cats
+0 0.2 0.0
+1 0.0 1.0
+2 0.7 0.0
+3 0.2 0.0
+
Using a Series, the number of places for specific columns can be +specified with the column names as index and the number of +decimal places as value
+>>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
+>>> df.round(decimals)
+ dogs cats
+0 0.2 0.0
+1 0.0 1.0
+2 0.7 0.0
+3 0.2 0.0
+
corr
(
method='pearson'
, min_periods=1
, numeric_only=False
)
Compute pairwise correlation of columns, excluding NA/null values.
method
+({'pearson', 'kendall', 'spearman'} or callable)
+— Method of correlation: 'pearson' (standard correlation coefficient), 'kendall' (Kendall Tau correlation coefficient), 'spearman' (Spearman rank correlation), or a callable with input two 1d ndarrays returning a float.
+min_periods
+(int, optional)
+— Minimum number of observations required per pair of columns to have a valid result. Currently only available for Pearson
+and Spearman correlation.
+numeric_only
+(bool, default False)
+— Include only float
, int
or boolean
data. The default value of numeric_only
is now False
.
+Correlation matrix.
DataFrame.corrwith : Compute pairwise correlation with another DataFrame or Series. +Series.corr : Compute the correlation between two Series.
+Notes
+Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
+Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>
+Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>
+Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>
+>>> def histogram_intersection(a, b):
+...     v = np.minimum(a, b).sum().round(decimals=1)
+... return v
+>>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
+... columns=['dogs', 'cats'])
+>>> df.corr(method=histogram_intersection)
+ dogs cats
+dogs 1.0 0.3
+cats 0.3 1.0
+
>>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)],
+... columns=['dogs', 'cats'])
+>>> df.corr(min_periods=3)
+ dogs cats
+dogs 1.0 NaN
+cats NaN 1.0
+
cov
(
min_periods=None
, ddof=1
, numeric_only=False
)
Compute pairwise covariance of columns, excluding NA/null values.
Compute the pairwise covariance among the series of a DataFrame.
+The returned data frame is the covariance matrix
+<https://en.wikipedia.org/wiki/Covariance_matrix>
__ of the columns
+of the DataFrame.
Both NA and null values are automatically excluded from the
+calculation. (See the note below about bias from missing values.)
+A threshold can be set for the minimum number of
+observations for each value created. Comparisons with observations
+below this threshold will be returned as NaN
.
This method is generally used for the analysis of time series data to +understand the relationship between different measures +across time.
+min_periods
+(int, optional)
+— Minimum number of observations required per pair of columns to have a valid result.
+ddof
+(int, default 1)
+— Delta degrees of freedom. The divisor used in calculations is N - ddof
, where N
represents the number of elements.
+This argument is applicable only when no nan
is in the dataframe.
+numeric_only
+(bool, default False)
+— Include only float
, int
or boolean
data.numeric_only
is now False
.
+The covariance matrix of the series of the DataFrame.
Series.cov : Compute covariance with another Series.core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample + covariance. +core.window.expanding.Expanding.cov : Expanding sample covariance. +core.window.rolling.Rolling.cov : Rolling sample covariance.
+Notes
+Returns the covariance matrix of the DataFrame's time series. +The covariance is normalized by N-ddof.
+For DataFrames that have Series that are missing data (assuming that
+data is missing at random
+<https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>
__)
+the returned covariance matrix will be an unbiased estimate
+of the variance and covariance between the member Series.
However, for many applications this estimate may not be acceptable
+because the estimate covariance matrix is not guaranteed to be positive
+semi-definite. This could lead to estimate correlations having
+absolute values which are greater than one, and/or a non-invertible
+covariance matrix. See Estimation of covariance matrices
+<https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
+matrices>
__ for more details.
>>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],... columns=['dogs', 'cats'])
+>>> df.cov()
+ dogs cats
+dogs 0.666667 -1.000000
+cats -1.000000 1.666667
+
>>> np.random.seed(42)
+>>> df = pd.DataFrame(np.random.randn(1000, 5),
+... columns=['a', 'b', 'c', 'd', 'e'])
+>>> df.cov()
+ a b c d e
+a 0.998438 -0.020161 0.059277 -0.008943 0.014144
+b -0.020161 1.059352 -0.008543 -0.024738 0.009826
+c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
+d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
+e 0.014144 0.009826 -0.000271 -0.013692 0.977795
+
Minimum number of periods
+This method also supports an optional min_periods
keyword
+that specifies the required minimum number of non-NA observations for
+each column pair in order to have a valid result:
>>> np.random.seed(42)
+>>> df = pd.DataFrame(np.random.randn(20, 3),
+... columns=['a', 'b', 'c'])
+>>> df.loc[df.index[:5], 'a'] = np.nan
+>>> df.loc[df.index[5:10], 'b'] = np.nan
+>>> df.cov(min_periods=12)
+ a b c
+a 0.316741 NaN -0.150812
+b NaN 1.248003 0.191417
+c -0.150812 0.191417 0.895202
+
corrwith
(
other
, axis=0
, drop=False
, method='pearson'
, numeric_only=False
)
Compute pairwise correlation.
Pairwise correlation is computed between rows or columns of +DataFrame with rows or columns of Series or DataFrame. DataFrames +are first aligned along both axes before computing the +correlations.
+other
+(DataFrame, Series)
+— Object with which to compute correlations.axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for column-wise.
+drop
+(bool, default False)
+— Drop missing indices from result.method
+({'pearson', 'kendall', 'spearman'} or callable)
+— Method of correlation: 'pearson', 'kendall', 'spearman', or a callable with input two 1d ndarrays returning a float.
+numeric_only
+(bool, default False)
+— Include only float
, int
or boolean
data.numeric_only
is now False
.
+Pairwise correlations.
DataFrame.corr : Compute pairwise correlation of columns.
>>> index = ["a", "b", "c", "d", "e"]>>> columns = ["one", "two", "three", "four"]
+>>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns)
+>>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns)
+>>> df1.corrwith(df2)
+one 1.0
+two 1.0
+three 1.0
+four 1.0
+dtype: float64
+
>>> df2.corrwith(df1, axis=1)
+a 1.0
+b 1.0
+c 1.0
+d 1.0
+e NaN
+dtype: float64
+
count
(
axis=0
, numeric_only=False
)
Count non-NA cells for each column or row.
The values None
, NaN
, NaT
, pandas.NA
are considered NA.
axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— If 0 or 'index' counts are generated for each column.
+If 1 or 'columns' counts are generated for each row.
+numeric_only
+(bool, default False)
+— Include only float
, int
or boolean
data.For each column/row the number of non-NA/null entries.
Series.count: Number of non-NA elements in a Series.DataFrame.value_counts: Count unique combinations of columns. +DataFrame.shape: Number of DataFrame rows and columns (including NA + elements). +DataFrame.isna: Boolean same-sized DataFrame showing places of NA + elements.
+Constructing DataFrame from a dictionary:
>>> df = pd.DataFrame({"Person":
+... ["John", "Myla", "Lewis", "John", "Myla"],
+... "Age": [24., np.nan, 21., 33, 26],
+... "Single": [False, True, True, True, False]})
+>>> df
+ Person Age Single
+0 John 24.0 False
+1 Myla NaN True
+2 Lewis 21.0 True
+3 John 33.0 True
+4 Myla 26.0 False
+
Notice the uncounted NA values:
+>>> df.count()
+Person 5
+Age 4
+Single 5
+dtype: int64
+
Counts for each row:
+>>> df.count(axis='columns')
+0 3
+1 2
+2 3
+3 3
+4 3
+dtype: int64
+
any
(
axis=0
, bool_only=False
, skipna=True
, **kwargs
)
Return whether any element is True, potentially over an axis.
Returns False unless there is at least one element within a series or +along a Dataframe axis that is True or equivalent (e.g. non-zero or +non-empty).
+axis
+({0 or 'index', 1 or 'columns', None}, default 0)
+— Indicate which axis or axes should be reduced. For Series
this parameteris unused and defaults to 0.bool_only
+(bool, default False)
+— Include only boolean columns. Not implemented for Series.skipna
+(bool, default True)
+— Exclude NA/null values. If the entire row/column is NA and skipna is True, then the result will be False, as for an empty row/column.
+If skipna is False, then NA are treated as True, because these are not
+equal to zero.
+**kwargs
+(any, default None)
+— Additional keywords have no effect but might be accepted for compatibility with NumPy.
+If level is specified, then, DataFrame is returned; otherwise, Series is returned.
+numpy.any : Numpy version of this method.Series.any : Return whether any element is True. +Series.all : Return whether all elements are True. +DataFrame.any : Return whether any element is True over requested axis. +DataFrame.all : Return whether all elements are True over requested axis.
+Series
For Series input, the output is a scalar indicating whether any element +is True.
+>>> pd.Series([False, False]).any()
+False
+>>> pd.Series([True, False]).any()
+True
+>>> pd.Series([], dtype="float64").any()
+False
+>>> pd.Series([np.nan]).any()
+False
+>>> pd.Series([np.nan]).any(skipna=False)
+True
+
DataFrame
+Whether each column contains at least one True element (the default).
+>>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
+>>> df
+ A B C
+0 1 0 0
+1 2 2 0
+
>>> df.any()
+A True
+B True
+C False
+dtype: bool
+
Aggregating over the columns.
+>>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
+>>> df
+ A B
+0 True 1
+1 False 2
+
>>> df.any(axis='columns')
+0 True
+1 True
+dtype: bool
+
>>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
+>>> df
+ A B
+0 True 1
+1 False 0
+
>>> df.any(axis='columns')
+0 True
+1 False
+dtype: bool
+
Aggregating over the entire DataFrame with axis=None
.
>>> df.any(axis=None)
+True
+
any
for an empty DataFrame is an empty Series.
>>> pd.DataFrame([]).any()
+Series([], dtype: bool)
+
all
(
axis=0
, bool_only=False
, skipna=True
, **kwargs
)
Return whether all elements are True, potentially over an axis.
Returns True unless there is at least one element within a series or
+axis
+({0 or 'index', 1 or 'columns', None}, default 0)
+— Indicate which axis or axes should be reduced. For Series
this parameteris unused and defaults to 0.bool_only
+(bool, default False)
+— Include only boolean columns. Not implemented for Series.skipna
+(bool, default True)
+— Exclude NA/null values. If the entire row/column is NA and skipna is True, then the result will be True, as for an empty row/column.
+If skipna is False, then NA are treated as True, because these are not
+equal to zero.
+**kwargs
+(any, default None)
+— Additional keywords have no effect but might be accepted for compatibility with NumPy.
+If level is specified, then, DataFrame is returned; otherwise, Series is returned.
+Series.all : Return True if all elements are True.DataFrame.any : Return True if one (or more) elements are True.
+Series
>>> pd.Series([True, True]).all()
+True
+>>> pd.Series([True, False]).all()
+False
+>>> pd.Series([], dtype="float64").all()
+True
+>>> pd.Series([np.nan]).all()
+True
+>>> pd.Series([np.nan]).all(skipna=False)
+True
+
DataFrames
+Create a dataframe from a dictionary.
+>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
+>>> df
+ col1 col2
+0 True True
+1 True False
+
Default behaviour checks if values in each column all return True.
+>>> df.all()
+col1 True
+col2 False
+dtype: bool
+
Specify axis='columns'
to check if values in each row all return True.
>>> df.all(axis='columns')
+0 True
+1 False
+dtype: bool
+
Or axis=None
for whether every value is True.
>>> df.all(axis=None)
+False
+
min
(
axis=0
, skipna=True
, numeric_only=False
, **kwargs
)
Return the minimum of the values over the requested axis.
If you want the index of the minimum, use idxmin
. This is the equivalent of the numpy.ndarray
method argmin
.
axis
+({index (0), columns (1)})
+— Axis for the function to be applied on.For Series
this parameter is unused and defaults to 0.axis=None
will apply the aggregation
+across both axes.skipna
+(bool, default True)
+— Exclude NA/null values when computing the result.numeric_only
+(bool, default False)
+— Include only float, int, boolean columns. Not implemented for Series.**kwargs
+
+— Additional keyword arguments to be passed to the function.Series.sum : Return the sum.Series.min : Return the minimum. +Series.max : Return the maximum. +Series.idxmin : Return the index of the minimum. +Series.idxmax : Return the index of the maximum. +DataFrame.sum : Return the sum over the requested axis. +DataFrame.min : Return the minimum over the requested axis. +DataFrame.max : Return the maximum over the requested axis. +DataFrame.idxmin : Return the index of the minimum over the requested axis. +DataFrame.idxmax : Return the index of the maximum over the requested axis.
+>>> idx = pd.MultiIndex.from_arrays([... ['warm', 'warm', 'cold', 'cold'],
+... ['dog', 'falcon', 'fish', 'spider']],
+... names=['blooded', 'animal'])
+>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
+>>> s
+blooded animal
+warm dog 4
+ falcon 2
+cold fish 0
+ spider 8
+Name: legs, dtype: int64
+
>>> s.min()
+0
+
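+For a DataFrame, min works per column by default and per row with axis=1; a small
+illustrative sketch (the frame is made up, not from the original examples):
+>>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 0]})
+>>> df.min()
+a    1
+b    0
+dtype: int64
+>>> df.min(axis=1)
+0    1
+1    0
+dtype: int64
+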
max
(
axis=0
, skipna=True
, numeric_only=False
, **kwargs
)
Return the maximum of the values over the requested axis.
If you want the index of the maximum, use idxmax
. This is the equivalent of the numpy.ndarray
method argmax
.
axis
+({index (0), columns (1)})
+— Axis for the function to be applied on.For Series
this parameter is unused and defaults to 0.axis=None
will apply the aggregation
+across both axes.skipna
+(bool, default True)
+— Exclude NA/null values when computing the result.numeric_only
+(bool, default False)
+— Include only float, int, boolean columns. Not implemented for Series.**kwargs
+
+— Additional keyword arguments to be passed to the function.Series.sum : Return the sum.Series.min : Return the minimum. +Series.max : Return the maximum. +Series.idxmin : Return the index of the minimum. +Series.idxmax : Return the index of the maximum. +DataFrame.sum : Return the sum over the requested axis. +DataFrame.min : Return the minimum over the requested axis. +DataFrame.max : Return the maximum over the requested axis. +DataFrame.idxmin : Return the index of the minimum over the requested axis. +DataFrame.idxmax : Return the index of the maximum over the requested axis.
+>>> idx = pd.MultiIndex.from_arrays([... ['warm', 'warm', 'cold', 'cold'],
+... ['dog', 'falcon', 'fish', 'spider']],
+... names=['blooded', 'animal'])
+>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
+>>> s
+blooded animal
+warm dog 4
+ falcon 2
+cold fish 0
+ spider 8
+Name: legs, dtype: int64
+
>>> s.max()
+8
+
sum
(
axis=0
, skipna=True
, numeric_only=False
, min_count=0
, **kwargs
)
Return the sum of the values over the requested axis.
This is equivalent to the method numpy.sum
.
axis
+({index (0), columns (1)})
+— Axis for the function to be applied on.For Series
this parameter is unused and defaults to 0.
+The behavior of DataFrame.sum with ``axis=None`` is deprecated;
+in a future version this will reduce over both axes and return a scalar.
+To retain the old behavior, pass axis=0 (or do not pass axis).
+
+.. versionadded:: 2.0.0
+skipna
+(bool, default True)
+— Exclude NA/null values when computing the result.numeric_only
+(bool, default False)
+— Include only float, int, boolean columns. Not implemented for Series.min_count
+(int, default 0)
+— The required number of valid values to perform the operation. If fewer thanmin_count
non-NA values are present the result will be NA.
+**kwargs
+
+— Additional keyword arguments to be passed to the function.Series.sum : Return the sum.Series.min : Return the minimum. +Series.max : Return the maximum. +Series.idxmin : Return the index of the minimum. +Series.idxmax : Return the index of the maximum. +DataFrame.sum : Return the sum over the requested axis. +DataFrame.min : Return the minimum over the requested axis. +DataFrame.max : Return the maximum over the requested axis. +DataFrame.idxmin : Return the index of the minimum over the requested axis. +DataFrame.idxmax : Return the index of the maximum over the requested axis.
+>>> idx = pd.MultiIndex.from_arrays([
+... ['warm', 'warm', 'cold', 'cold'],
+... ['dog', 'falcon', 'fish', 'spider']],
+... names=['blooded', 'animal'])
+>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
+>>> s
+blooded animal
+warm dog 4
+ falcon 2
+cold fish 0
+ spider 8
+Name: legs, dtype: int64
+
>>> s.sum()
+14
+
By default, the sum of an empty or all-NA Series is 0
.
>>> pd.Series([], dtype="float64").sum() # min_count=0 is the default
+0.0
+
This can be controlled with the min_count
parameter. For example, if
+you'd like the sum of an empty series to be NaN, pass min_count=1
.
>>> pd.Series([], dtype="float64").sum(min_count=1)
+nan
+
Thanks to the skipna
parameter, min_count
handles all-NA and
+empty series identically.
>>> pd.Series([np.nan]).sum()
+0.0
+
>>> pd.Series([np.nan]).sum(min_count=1)
+nan
+
prod
(
axis=0
, skipna=True
, numeric_only=False
, min_count=0
, **kwargs
)
Return the product of the values over the requested axis.
axis
+({index (0), columns (1)})
+— Axis for the function to be applied on.For Series
this parameter is unused and defaults to 0.The behavior of DataFrame.prod with ``axis=None`` is deprecated,
+in a future version this will reduce over both axes and return a scalar
+To retain the old behavior, pass axis=0 (or do not pass axis).
+
+.. versionadded:: 2.0.0
+skipna
+(bool, default True)
+— Exclude NA/null values when computing the result.numeric_only
+(bool, default False)
+— Include only float, int, boolean columns. Not implemented for Series.min_count
+(int, default 0)
+— The required number of valid values to perform the operation. If fewer thanmin_count
non-NA values are present the result will be NA.
+**kwargs
+
+— Additional keyword arguments to be passed to the function.Series.sum : Return the sum.Series.min : Return the minimum. +Series.max : Return the maximum. +Series.idxmin : Return the index of the minimum. +Series.idxmax : Return the index of the maximum. +DataFrame.sum : Return the sum over the requested axis. +DataFrame.min : Return the minimum over the requested axis. +DataFrame.max : Return the maximum over the requested axis. +DataFrame.idxmin : Return the index of the minimum over the requested axis. +DataFrame.idxmax : Return the index of the maximum over the requested axis.
+By default, the product of an empty or all-NA Series is 1
>>> pd.Series([], dtype="float64").prod()
+1.0
+
This can be controlled with the min_count
parameter
>>> pd.Series([], dtype="float64").prod(min_count=1)
+nan
+
Thanks to the skipna
parameter, min_count
handles all-NA and
+empty series identically.
>>> pd.Series([np.nan]).prod()
+1.0
+
>>> pd.Series([np.nan]).prod(min_count=1)
+nan
+
mean
(
axis=0
, skipna=True
, numeric_only=False
, **kwargs
)
Return the mean of the values over the requested axis.
axis
+({index (0), columns (1)})
+— Axis for the function to be applied on.For Series
this parameter is unused and defaults to 0.axis=None
will apply the aggregation
+across both axes.skipna
+(bool, default True)
+— Exclude NA/null values when computing the result.numeric_only
+(bool, default False)
+— Include only float, int, boolean columns. Not implemented for Series.**kwargs
+
+— Additional keyword arguments to be passed to the function.
+median
(
axis=0
, skipna=True
, numeric_only=False
, **kwargs
)
Return the median of the values over the requested axis.
axis
+({index (0), columns (1)})
+— Axis for the function to be applied on.For Series
this parameter is unused and defaults to 0.axis=None
will apply the aggregation
+across both axes.skipna
+(bool, default True)
+— Exclude NA/null values when computing the result.numeric_only
+(bool, default False)
+— Include only float, int, boolean columns. Not implemented for Series.**kwargs
+
+— Additional keyword arguments to be passed to the function.
+sem
(
axis=0
, skipna=True
, ddof=1
, numeric_only=False
, **kwargs
)
Return unbiased standard error of the mean over requested axis.
Normalized by N-1 by default. This can be changed using the ddof argument
+axis
+({index (0), columns (1)})
+— For Series
this parameter is unused and defaults to 0.The behavior of DataFrame.sem with ``axis=None`` is deprecated,
+in a future version this will reduce over both axes and return a scalar
+To retain the old behavior, pass axis=0 (or do not pass axis).
+
+skipna
+(bool, default True)
+— Exclude NA/null values. If an entire row/column is NA, the result will be NA.
+ddof
+(int, default 1)
+— Delta Degrees of Freedom. The divisor used in calculations is N - ddof,where N represents the number of elements.
+numeric_only
+(bool, default False)
+— Include only float, int, boolean columns. Not implemented for Series.
+var
(
axis=0
, skipna=True
, ddof=1
, numeric_only=False
, **kwargs
)
Return unbiased variance over requested axis.
Normalized by N-1 by default. This can be changed using the ddof argument.
+axis
+({index (0), columns (1)})
+— For Series
this parameter is unused and defaults to 0.The behavior of DataFrame.var with ``axis=None`` is deprecated,
+in a future version this will reduce over both axes and return a scalar
+To retain the old behavior, pass axis=0 (or do not pass axis).
+
+skipna
+(bool, default True)
+— Exclude NA/null values. If an entire row/column is NA, the result will be NA.
+ddof
+(int, default 1)
+— Delta Degrees of Freedom. The divisor used in calculations is N - ddof,where N represents the number of elements.
+numeric_only
+(bool, default False)
+— Include only float, int, boolean columns. Not implemented for Series.>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],... 'age': [21, 25, 62, 43],
+... 'height': [1.61, 1.87, 1.49, 2.01]}
+... ).set_index('person_id')
+>>> df
+ age height
+person_id
+0 21 1.61
+1 25 1.87
+2 62 1.49
+3 43 2.01
+
>>> df.var()
+age 352.916667
+height 0.056367
+dtype: float64
+
Alternatively, ddof=0
can be set to normalize by N instead of N-1:
>>> df.var(ddof=0)
+age 264.687500
+height 0.042275
+dtype: float64
+
std
(
axis=0
, skipna=True
, ddof=1
, numeric_only=False
, **kwargs
)
Return sample standard deviation over requested axis.
Normalized by N-1 by default. This can be changed using the ddof argument.
+axis
+({index (0), columns (1)})
+— For Series
this parameter is unused and defaults to 0.The behavior of DataFrame.std with ``axis=None`` is deprecated,
+in a future version this will reduce over both axes and return a scalar
+To retain the old behavior, pass axis=0 (or do not pass axis).
+
+skipna
+(bool, default True)
+— Exclude NA/null values. If an entire row/column is NA, the result will be NA.
+ddof
+(int, default 1)
+— Delta Degrees of Freedom. The divisor used in calculations is N - ddof,where N represents the number of elements.
+numeric_only
+(bool, default False)
+— Include only float, int, boolean columns. Not implemented for Series.Notes
+To have the same behaviour as numpy.std
, use ddof=0
(instead of the
+default ddof=1
)
+>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
+... 'age': [21, 25, 62, 43],
+... 'height': [1.61, 1.87, 1.49, 2.01]}
+... ).set_index('person_id')
+>>> df
+ age height
+person_id
+0 21 1.61
+1 25 1.87
+2 62 1.49
+3 43 2.01
+
The standard deviation of the columns can be found as follows:
+>>> df.std()
+age 18.786076
+height 0.237417
+dtype: float64
+
Alternatively, ddof=0
can be set to normalize by N instead of N-1:
>>> df.std(ddof=0)
+age 16.269219
+height 0.205609
+dtype: float64
+
skew
(
axis=0
, skipna=True
, numeric_only=False
, **kwargs
)
Return unbiased skew over requested axis.
Normalized by N-1.
+axis
+({index (0), columns (1)})
+— Axis for the function to be applied on.For Series
this parameter is unused and defaults to 0.axis=None
will apply the aggregation
+across both axes.skipna
+(bool, default True)
+— Exclude NA/null values when computing the result.numeric_only
+(bool, default False)
+— Include only float, int, boolean columns. Not implemented for Series.**kwargs
+
+— Additional keyword arguments to be passed to the function.
+kurt
(
axis=0
, skipna=True
, numeric_only=False
, **kwargs
)
Return unbiased kurtosis over requested axis.
Kurtosis obtained using Fisher's definition of +kurtosis (kurtosis of normal == 0.0). Normalized by N-1.
+axis
+({index (0), columns (1)})
+— Axis for the function to be applied on.For Series
this parameter is unused and defaults to 0.axis=None
will apply the aggregation
+across both axes.skipna
+(bool, default True)
+— Exclude NA/null values when computing the result.numeric_only
+(bool, default False)
+— Include only float, int, boolean columns. Not implemented for Series.**kwargs
+
+— Additional keyword arguments to be passed to the function.
+cummin
(
axis=None
, skipna=True
, *args
, **kwargs
)
Return cumulative minimum over a DataFrame or Series axis.
Returns a DataFrame or Series of the same size containing the cumulative +minimum.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The index or the name of the axis. 0 is equivalent to None or 'index'.For Series
this parameter is unused and defaults to 0.
+skipna
+(bool, default True)
+— Exclude NA/null values. If an entire row/column is NA, the result will be NA.
+Return cumulative minimum of Series or DataFrame.
core.window.expanding.Expanding.min : Similar functionality but ignores NaN
values.
+DataFrame.min : Return the minimum over
+ DataFrame axis.
+DataFrame.cummax : Return cumulative maximum over DataFrame axis.
+DataFrame.cummin : Return cumulative minimum over DataFrame axis.
+DataFrame.cumsum : Return cumulative sum over DataFrame axis.
+DataFrame.cumprod : Return cumulative product over DataFrame axis.
Series
>>> s = pd.Series([2, np.nan, 5, -1, 0])
+>>> s
+0 2.0
+1 NaN
+2 5.0
+3 -1.0
+4 0.0
+dtype: float64
+
By default, NA values are ignored.
+>>> s.cummin()
+0 2.0
+1 NaN
+2 2.0
+3 -1.0
+4 -1.0
+dtype: float64
+
To include NA values in the operation, use skipna=False
>>> s.cummin(skipna=False)
+0 2.0
+1 NaN
+2 NaN
+3 NaN
+4 NaN
+dtype: float64
+
DataFrame
+>>> df = pd.DataFrame([[2.0, 1.0],
+... [3.0, np.nan],
+... [1.0, 0.0]],
+... columns=list('AB'))
+>>> df
+ A B
+0 2.0 1.0
+1 3.0 NaN
+2 1.0 0.0
+
By default, iterates over rows and finds the minimum
+in each column. This is equivalent to axis=None
or axis='index'
.
>>> df.cummin()
+ A B
+0 2.0 1.0
+1 2.0 NaN
+2 1.0 0.0
+
To iterate over columns and find the minimum in each row,
+use axis=1
>>> df.cummin(axis=1)
+ A B
+0 2.0 1.0
+1 3.0 NaN
+2 1.0 0.0
+
cummax
(
axis=None
, skipna=True
, *args
, **kwargs
)
Return cumulative maximum over a DataFrame or Series axis.
Returns a DataFrame or Series of the same size containing the cumulative +maximum.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The index or the name of the axis. 0 is equivalent to None or 'index'.For Series
this parameter is unused and defaults to 0.
+skipna
+(bool, default True)
+— Exclude NA/null values. If an entire row/column is NA, the result will be NA.
+Return cumulative maximum of Series or DataFrame.
core.window.expanding.Expanding.max : Similar functionality but ignores NaN
values.
+DataFrame.max : Return the maximum over
+ DataFrame axis.
+DataFrame.cummax : Return cumulative maximum over DataFrame axis.
+DataFrame.cummin : Return cumulative minimum over DataFrame axis.
+DataFrame.cumsum : Return cumulative sum over DataFrame axis.
+DataFrame.cumprod : Return cumulative product over DataFrame axis.
Series
>>> s = pd.Series([2, np.nan, 5, -1, 0])
+>>> s
+0 2.0
+1 NaN
+2 5.0
+3 -1.0
+4 0.0
+dtype: float64
+
By default, NA values are ignored.
+>>> s.cummax()
+0 2.0
+1 NaN
+2 5.0
+3 5.0
+4 5.0
+dtype: float64
+
To include NA values in the operation, use skipna=False
>>> s.cummax(skipna=False)
+0 2.0
+1 NaN
+2 NaN
+3 NaN
+4 NaN
+dtype: float64
+
DataFrame
+>>> df = pd.DataFrame([[2.0, 1.0],
+... [3.0, np.nan],
+... [1.0, 0.0]],
+... columns=list('AB'))
+>>> df
+ A B
+0 2.0 1.0
+1 3.0 NaN
+2 1.0 0.0
+
By default, iterates over rows and finds the maximum
+in each column. This is equivalent to axis=None
or axis='index'
.
>>> df.cummax()
+ A B
+0 2.0 1.0
+1 3.0 NaN
+2 3.0 1.0
+
To iterate over columns and find the maximum in each row,
+use axis=1
>>> df.cummax(axis=1)
+ A B
+0 2.0 2.0
+1 3.0 NaN
+2 1.0 1.0
+
cumsum
(
axis=None
, skipna=True
, *args
, **kwargs
)
Return cumulative sum over a DataFrame or Series axis.
Returns a DataFrame or Series of the same size containing the cumulative +sum.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The index or the name of the axis. 0 is equivalent to None or 'index'.For Series
this parameter is unused and defaults to 0.
+skipna
+(bool, default True)
+— Exclude NA/null values. If an entire row/column is NA, the result will be NA.
+Return cumulative sum of Series or DataFrame.
core.window.expanding.Expanding.sum : Similar functionality but ignores NaN
values.
+DataFrame.sum : Return the sum over
+ DataFrame axis.
+DataFrame.cummax : Return cumulative maximum over DataFrame axis.
+DataFrame.cummin : Return cumulative minimum over DataFrame axis.
+DataFrame.cumsum : Return cumulative sum over DataFrame axis.
+DataFrame.cumprod : Return cumulative product over DataFrame axis.
Series
>>> s = pd.Series([2, np.nan, 5, -1, 0])
+>>> s
+0 2.0
+1 NaN
+2 5.0
+3 -1.0
+4 0.0
+dtype: float64
+
By default, NA values are ignored.
+>>> s.cumsum()
+0 2.0
+1 NaN
+2 7.0
+3 6.0
+4 6.0
+dtype: float64
+
To include NA values in the operation, use skipna=False
>>> s.cumsum(skipna=False)
+0 2.0
+1 NaN
+2 NaN
+3 NaN
+4 NaN
+dtype: float64
+
DataFrame
+>>> df = pd.DataFrame([[2.0, 1.0],
+... [3.0, np.nan],
+... [1.0, 0.0]],
+... columns=list('AB'))
+>>> df
+ A B
+0 2.0 1.0
+1 3.0 NaN
+2 1.0 0.0
+
By default, iterates over rows and finds the sum
+in each column. This is equivalent to axis=None
or axis='index'
.
>>> df.cumsum()
+ A B
+0 2.0 1.0
+1 5.0 NaN
+2 6.0 1.0
+
To iterate over columns and find the sum in each row,
+use axis=1
>>> df.cumsum(axis=1)
+ A B
+0 2.0 3.0
+1 3.0 NaN
+2 1.0 1.0
+
cumprod
(
axis=None
, skipna=True
, *args
, **kwargs
)
Return cumulative product over a DataFrame or Series axis.
Returns a DataFrame or Series of the same size containing the cumulative +product.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The index or the name of the axis. 0 is equivalent to None or 'index'.For Series
this parameter is unused and defaults to 0.
+skipna
+(bool, default True)
+— Exclude NA/null values. If an entire row/column is NA, the result will be NA.
+Return cumulative product of Series or DataFrame.
core.window.expanding.Expanding.prod : Similar functionality but ignores NaN
values.
+DataFrame.prod : Return the product over
+ DataFrame axis.
+DataFrame.cummax : Return cumulative maximum over DataFrame axis.
+DataFrame.cummin : Return cumulative minimum over DataFrame axis.
+DataFrame.cumsum : Return cumulative sum over DataFrame axis.
+DataFrame.cumprod : Return cumulative product over DataFrame axis.
Series
>>> s = pd.Series([2, np.nan, 5, -1, 0])
+>>> s
+0 2.0
+1 NaN
+2 5.0
+3 -1.0
+4 0.0
+dtype: float64
+
By default, NA values are ignored.
+>>> s.cumprod()
+0 2.0
+1 NaN
+2 10.0
+3 -10.0
+4 -0.0
+dtype: float64
+
To include NA values in the operation, use skipna=False
>>> s.cumprod(skipna=False)
+0 2.0
+1 NaN
+2 NaN
+3 NaN
+4 NaN
+dtype: float64
+
DataFrame
+>>> df = pd.DataFrame([[2.0, 1.0],
+... [3.0, np.nan],
+... [1.0, 0.0]],
+... columns=list('AB'))
+>>> df
+ A B
+0 2.0 1.0
+1 3.0 NaN
+2 1.0 0.0
+
By default, iterates over rows and finds the product
+in each column. This is equivalent to axis=None
or axis='index'
.
>>> df.cumprod()
+ A B
+0 2.0 1.0
+1 6.0 NaN
+2 6.0 0.0
+
To iterate over columns and find the product in each row,
+use axis=1
>>> df.cumprod(axis=1)
+ A B
+0 2.0 2.0
+1 3.0 NaN
+2 1.0 0.0
+
nunique
(
axis=0
, dropna=True
)
Count number of distinct elements in specified axis.
Return Series with number of distinct elements. Can ignore NaN +values.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+dropna
+(bool, default True)
+— Don't include NaN in the counts.Series.nunique: Method nunique for Series.DataFrame.count: Count non-NA cells for each column or row.
+>>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
+>>> df.nunique()
+A 3
+B 2
+dtype: int64
+
>>> df.nunique(axis=1)
+0 1
+1 2
+2 2
+dtype: int64
+
idxmin
(
axis=0
, skipna=True
, numeric_only=False
)
Return index of first occurrence of minimum over requested axis.
NA/null values are excluded.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.skipna
+(bool, default True)
+— Exclude NA/null values. If an entire row/column is NA, the result will be NA.
+numeric_only
+(bool, default False)
+— Include only float
, int
or boolean
data.Indexes of minima along the specified axis.
ValueError
+
+— Series.idxmin : Return index of the minimum element.
Notes
+This method is the DataFrame version of ndarray.argmin
.
Consider a dataset containing food consumption in Argentina.
>>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
+... 'co2_emissions': [37.2, 19.66, 1712]},
+... index=['Pork', 'Wheat Products', 'Beef'])
+
>>> df
+ consumption co2_emissions
+Pork 10.51 37.20
+Wheat Products 103.11 19.66
+Beef 55.48 1712.00
+
By default, it returns the index for the minimum value in each column.
+>>> df.idxmin()
+consumption Pork
+co2_emissions Wheat Products
+dtype: object
+
To return the index for the minimum value in each row, use axis="columns"
.
>>> df.idxmin(axis="columns")
+Pork consumption
+Wheat Products co2_emissions
+Beef consumption
+dtype: object
+
idxmax
(
axis=0
, skipna=True
, numeric_only=False
)
Return index of first occurrence of maximum over requested axis.
NA/null values are excluded.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.skipna
+(bool, default True)
+— Exclude NA/null values. If an entire row/column is NA, the result will be NA.
+numeric_only
+(bool, default False)
+— Include only float
, int
or boolean
data.Indexes of maxima along the specified axis.
ValueError
+
+— Series.idxmax : Return index of the maximum element.
Notes
+This method is the DataFrame version of ndarray.argmax
.
Consider a dataset containing food consumption in Argentina.
>>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
+... 'co2_emissions': [37.2, 19.66, 1712]},
+... index=['Pork', 'Wheat Products', 'Beef'])
+
>>> df
+ consumption co2_emissions
+Pork 10.51 37.20
+Wheat Products 103.11 19.66
+Beef 55.48 1712.00
+
By default, it returns the index for the maximum value in each column.
+>>> df.idxmax()
+consumption Wheat Products
+co2_emissions Beef
+dtype: object
+
To return the index for the maximum value in each row, use axis="columns"
.
>>> df.idxmax(axis="columns")
+Pork co2_emissions
+Wheat Products consumption
+Beef co2_emissions
+dtype: object
+
mode
(
axis=0
, numeric_only=False
, dropna=True
)
Get the mode(s) of each element along the selected axis.
The mode of a set of values is the value that appears most often. +It can be multiple values.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis to iterate over while searching for the mode:numeric_only
+(bool, default False)
+— If True, only apply to numeric columns.dropna
+(bool, default True)
+— Don't consider counts of NaN/NaT.The modes of each column or row.
Series.mode : Return the highest frequency value in a Series.Series.value_counts : Return the counts of values in a Series.
+>>> df = pd.DataFrame([('bird', 2, 2),
+... ('mammal', 4, np.nan),
+... ('arthropod', 8, 0),
+... ('bird', 2, np.nan)],
+... index=('falcon', 'horse', 'spider', 'ostrich'),
+... columns=('species', 'legs', 'wings'))
+>>> df
+ species legs wings
+falcon bird 2 2.0
+horse mammal 4 NaN
+spider arthropod 8 0.0
+ostrich bird 2 NaN
+
By default, missing values are not considered, and the mode of wings
+are both 0 and 2. Because the resulting DataFrame has two rows,
+the second row of species
and legs
contains NaN
.
>>> df.mode()
+ species legs wings
+0 bird 2.0 0.0
+1 NaN NaN 2.0
+
Setting dropna=False
NaN
values are considered and they can be
+the mode (like for wings).
>>> df.mode(dropna=False)
+ species legs wings
+0 bird 2 NaN
+
Setting numeric_only=True
, only the mode of numeric columns is
+computed, and columns of other types are ignored.
>>> df.mode(numeric_only=True)
+ legs wings
+0 2.0 0.0
+1 NaN 2.0
+
To compute the mode over columns and not rows, use the axis parameter:
+>>> df.mode(axis='columns', numeric_only=True)
+ 0 1
+falcon 2.0 NaN
+horse 4.0 NaN
+spider 0.0 8.0
+ostrich 2.0 NaN
+
quantile
(
q=0.5
, axis=0
, numeric_only=False
, interpolation='linear'
, method='single'
)
Return values at the given quantile over requested axis.
q
+(float or array-like, default 0.5 (50% quantile))
+— Value between 0 <= q <= 1, the quantile(s) to compute.axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.numeric_only
+(bool, default False)
+— Include only float
, int
or boolean
data.numeric_only
is now False
.
+interpolation
+({'linear', 'lower', 'higher', 'midpoint', 'nearest'})
+— This optional parameter specifies the interpolation method to use,when the desired quantile lies between two data points i
and j
:i + (j - i) * fraction
, where fraction
is the
+ fractional part of the index surrounded by i
and j
.i
.j
.i
or j
whichever is nearest.i
+ j
) / 2.method
+({'single', 'table'}, default 'single')
+— Whether to compute quantiles per-column ('single') or over all columns ('table'). When 'table', the only allowed interpolation methods are
+'nearest', 'lower', and 'higher'.
+core.window.rolling.Rolling.quantile: Rolling quantile.numpy.percentile: Numpy function to compute the percentile.
+>>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
+... columns=['a', 'b'])
+>>> df.quantile(.1)
+a 1.3
+b 3.7
+Name: 0.1, dtype: float64
+>>> df.quantile([.1, .5])
+ a b
+0.1 1.3 3.7
+0.5 2.5 55.0
+
Specifying method='table'
will compute the quantile over all columns.
>>> df.quantile(.1, method="table", interpolation="nearest")
+a 1
+b 1
+Name: 0.1, dtype: int64
+>>> df.quantile([.1, .5], method="table", interpolation="nearest")
+ a b
+0.1 1 1
+0.5 3 100
+
Specifying numeric_only=False
will also compute the quantile of
+datetime and timedelta data.
>>> df = pd.DataFrame({'A': [1, 2],
+... 'B': [pd.Timestamp('2010'),
+... pd.Timestamp('2011')],
+... 'C': [pd.Timedelta('1 days'),
+... pd.Timedelta('2 days')]})
+>>> df.quantile(0.5, numeric_only=False)
+A 1.5
+B 2010-07-02 12:00:00
+C 1 days 12:00:00
+Name: 0.5, dtype: object
+
to_timestamp
(
freq=None
, how='start'
, axis=0
, copy=None
)
Cast to DatetimeIndex of timestamps, at beginning of period.
freq
+(str, default frequency of PeriodIndex)
+— Desired frequency.how
+({'s', 'e', 'start', 'end'})
+— Convention for converting period to timestamp; start of periodvs. end.
+axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis to convert (the index by default).copy
+(bool, default True)
+— If False then underlying input data is not copied.copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas.You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+The DataFrame has a DatetimeIndex.
+>>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y')
+>>> d = {'col1': [1, 2], 'col2': [3, 4]}
+>>> df1 = pd.DataFrame(data=d, index=idx)
+>>> df1
+ col1 col2
+2023 1 3
+2024 2 4
+
The resulting timestamps will be at the beginning of the year in this case
+>>> df1 = df1.to_timestamp()
+>>> df1
+ col1 col2
+2023-01-01 1 3
+2024-01-01 2 4
+>>> df1.index
+DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None)
+
Using freq
which is the offset that the Timestamps will have
>>> df2 = pd.DataFrame(data=d, index=idx)
+>>> df2 = df2.to_timestamp(freq='M')
+>>> df2
+ col1 col2
+2023-01-31 1 3
+2024-01-31 2 4
+>>> df2.index
+DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None)
+
to_period
(
freq=None
, axis=0
, copy=None
)
Convert DataFrame from DatetimeIndex to PeriodIndex.
Convert DataFrame from DatetimeIndex to PeriodIndex with desired +frequency (inferred from index if not passed).
+freq
+(str, default)
+— Frequency of the PeriodIndex.axis
+({0 or 'index', 1 or 'columns'}, default 0)
+— The axis to convert (the index by default).copy
+(bool, default True)
+— If False then underlying input data is not copied.copy
keyword will change behavior in pandas 3.0.
+ Copy-on-Write
+ <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>
__
+ will be enabled by default, which means that all methods with a
+ copy
keyword will use a lazy copy mechanism to defer the copy and
+ ignore the copy
keyword. The copy
keyword will be removed in a
+ future version of pandas.You can already get the future behavior and improvements through
+enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+The DataFrame has a PeriodIndex.
+>>> idx = pd.to_datetime(
+... [
+... "2001-03-31 00:00:00",
+... "2002-05-31 00:00:00",
+... "2003-08-31 00:00:00",
+... ]
+... )
+
>>> idx
+DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
+dtype='datetime64[ns]', freq=None)
+
>>> idx.to_period("M")
+PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
+
For the yearly frequency
+>>> idx.to_period("Y")
+PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]')
+
isin
(
values
)
Whether each element in the DataFrame is contained in values.
values
+(iterable, Series, DataFrame or dict)
+— The result will only be true at a location if all the labels match. If values
is a Series, that's the index. If
+values
is a dict, the keys must be the column names,
+which must match. If values
is a DataFrame,
+then both the index and column labels must match.
+DataFrame of booleans showing whether each element in the DataFrameis contained in values.
+DataFrame.eq: Equality test for DataFrame.Series.isin: Equivalent method on Series. +Series.str.contains: Test if pattern or regex is contained within a + string of a Series or Index.
+>>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
+... index=['falcon', 'dog'])
+>>> df
+ num_legs num_wings
+falcon 2 2
+dog 4 0
+
When values
is a list check whether every value in the DataFrame
+is present in the list (which animals have 0 or 2 legs or wings)
>>> df.isin([0, 2])
+ num_legs num_wings
+falcon True True
+dog False True
+
To check if values
is not in the DataFrame, use the ~
operator:
>>> ~df.isin([0, 2])
+ num_legs num_wings
+falcon False False
+dog True False
+
When values
is a dict, we can pass values to check for each
+column separately:
>>> df.isin({'num_wings': [0, 3]})
+ num_legs num_wings
+falcon False False
+dog False True
+
When values
is a Series or DataFrame the index and column must
+match. Note that 'falcon' does not match based on the number of legs
+in other.
>>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]},
+... index=['spider', 'falcon'])
+>>> df.isin(other)
+ num_legs num_wings
+falcon False True
+dog False False
+
create
(
value
)
Create a channel from a list.
The second dimension is identified by tuples. If all elements are tuples, a channel is created directly. Otherwise, elements are converted to tuples first and then the channel is created.
+>>> Channel.create([1, 2, 3])  # 3 rows, 1 column
+>>> Channel.create([(1, 2, 3)])  # 1 row, 3 columns
+
value
+(Union)
+— The value to create a channelA channel (dataframe)
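A minimal sketch of the two creation modes described above (scalars become one-column rows; a tuple becomes a single multi-column row):

from pipen.channel import Channel

ch1 = Channel.create([1, 2, 3])      # 3 rows, 1 column
ch2 = Channel.create([(1, 2, 3)])    # 1 row, 3 columns
print(ch1.shape, ch2.shape)          # (3, 1) (1, 3)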
from_glob
(
pattern
, ftype='any'
, sortby='name'
, reverse=False
)
Create a channel with a glob pattern
ftype
+(str, optional)
+— The file type, one of any, link, dir and filesortby
+(str, optional)
+— How the files should be sorted. One of name, mtime and sizereverse
+(bool, optional)
+— Whether to sort them in reverse.The channel
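A hedged sketch of from_glob() using the documented pattern/ftype/sortby/reverse arguments; the file layout is illustrative:

from pipen.channel import Channel

# One row per matching file, sorted by modification time, newest first
ch = Channel.from_glob("data/*.txt", ftype="file", sortby="mtime", reverse=True)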
from_pairs
(
pattern
, ftype='any'
, sortby='name'
, reverse=False
)
Create a width=2 channel with a glob pattern
ftype
+(str, optional)
+— The file type, one of any, link, dir and filesortby
+(str, optional)
+— How the files should be sorted. One of name, mtime and sizereverse
+(bool, optional)
+— Whether to sort them in reverse.The channel
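A hedged sketch of from_pairs(), which builds the width-2 channel by pairing consecutive matches; the R1/R2 naming is illustrative:

from pipen.channel import Channel

# Each row holds a pair of files, e.g. (sample1_R1.fq.gz, sample1_R2.fq.gz)
pairs = Channel.from_pairs("data/*_R[12].fq.gz", sortby="name")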
from_csv
(
*args
, **kwargs
)
Create a channel from a csv file
Uses pandas.read_csv() to create a channel
+*args
+
+— and**kwargs
+
+— Arguments passing to pandas.read_csv()from_excel
(
*args
, **kwargs
)
Create a channel from an excel file.
Uses pandas.read_excel() to create a channel
+*args
+
+— and**kwargs
+
+— Arguments passing to pandas.read_excel()from_table
(
*args
, **kwargs
)
Create a channel from a table file.
Uses pandas.read_table() to create a channel
+*args
+
+— and**kwargs
+
+— Arguments passing to pandas.read_table()pipen.channel.
expand_dir
(
data
, col=0
, pattern='*'
, ftype='any'
, sortby='name'
, reverse=False
)
Expand a Channel according to the files in
This is only applicable to a 1-row channel.
+>>> ch = channel.create([('./', 1)])
+>>> ch >> expand()
+>>> [['./a', 1], ['./b', 1], ['./c', 1]]
+
col
+(str | int, optional)
+— the index or name of the column used to expandpattern
+(str, optional)
+— use a pattern to filter the files/dirs, default: *
ftype
+(str, optional)
+— the type of the files/dirs to includesortby
+(str, optional)
+— how the list is sortedreverse
+(bool, optional)
+— reverse sort.The expanded channel
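A hedged sketch of the plain-function form of expand_dir(); calling it directly with the channel as the first argument is an assumption about the verb-style API, and the paths/pattern are illustrative:

from pipen.channel import Channel, expand_dir

ch = Channel.create([("./samples", "groupA")])
# One row per file matching the pattern inside ./samples; other columns repeat
expanded = expand_dir(ch, col=0, pattern="*.fq.gz", sortby="name")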
pipen.channel.
collapse_files
(
data
, col=0
)
Collapse a Channel according to the files in
Note that other values in other rows will be discarded.
+>>> ch = channel.create([['./a', 1], ['./b', 1], ['./c', 1]])
+>>> ch >> collapse()
+>>> [['.', 1]]
+
data
+(DataFrame)
+— The original channelcol
+(str | int, optional)
+— the index or name of the column used to collapse onThe collapsed channel
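The inverse operation, again sketched as a plain function call under the same assumption:

from pipen.channel import Channel, collapse_files

ch = Channel.create([("./data/a.txt", 1), ("./data/b.txt", 1)])
# Collapse the file paths in column 0 to their common directory
collapsed = collapse_files(ch, col=0)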
Print help for commands
CLIHelpPlugin
+
+— Print help for commands</>pipen.cli.help.
CLIHelpPlugin
(
parser
, subparser
)
Print help for commands
exec_command
(
args
)
+
+— Run the command</>parse_args
(
)
+(Namespace)
+— Define arguments for the command</>parse_args
(
)
→ NamespaceDefine arguments for the command
exec_command
(
args
)
Run the command
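A rough sketch of a custom subcommand built on the interface documented here (a constructor taking parser and subparser, parse_args() producing a Namespace, and exec_command(args) running the command). The import path, the name attribute, the add_argument call and registration via a pipen_cli entry point are assumptions, not verbatim pipen API:

from pipen.cli import CLIPlugin


class CLIHelloPlugin(CLIPlugin):
    """Say hello (illustrative subcommand: `pipen hello`)"""

    name = "hello"

    def __init__(self, parser, subparser):
        super().__init__(parser, subparser)
        subparser.add_argument("--who", default="world", help="Who to greet")

    def exec_command(self, args) -> None:
        # args is the Namespace produced by parse_args()
        print(f"hello, {args.who}")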
List plugins
CliPluginsPlugin
+
+— List installed plugins</>pipen.cli.plugins.
CliPluginsPlugin
(
parser
, subparser
)
List installed plugins
exec_command
(
args
)
+
+— Execute the command</>parse_args
(
)
+(Namespace)
+— Define arguments for the command</>parse_args
(
)
→ NamespaceDefine arguments for the command
exec_command
(
args
)
Execute the command
List available profiles.
CLIProfilePlugin
+
+— List available profiles.</>pipen.cli.profile.
CLIProfilePlugin
(
parser
, subparser
)
List available profiles.
exec_command
(
args
)
+
+— Run the command</>parse_args
(
)
+(Namespace)
+— Define arguments for the command</>parse_args
(
)
→ NamespaceDefine arguments for the command
exec_command
(
args
)
Run the command
Print help for commands
CLIVersionPlugin
+
+— Print versions of pipen and its dependencies</>pipen.cli.version.
CLIVersionPlugin
(
parser
, subparser
)
Print versions of pipen and its dependencies
exec_command
(
args
)
+
+— Run the command</>parse_args
(
)
+(Namespace)
+— Define arguments for the command</>parse_args
(
)
→ NamespaceDefine arguments for the command
exec_command
(
args
)
Run the command
List available profiles.
CLIProfilePlugin
+
+— List available profiles.</>Print help for commands
CLIHelpPlugin
+
+— Print help for commands</>Print help for commands
CLIVersionPlugin
+
+— Print versions of pipen and its dependencies</>List plugins
CliPluginsPlugin
+
+— List installed plugins</>Provide some default values/objects
ProcInputType
+
+— Types for process inputs</>ProcOutputType
+
+— Types for process outputs</>pipen.defaults.
ProcInputType
(
)
Types for process inputs
pipen.defaults.
ProcOutputType
(
)
Types for process outputs
Provide exception classes
PipenException
+
+— Base exception class for pipen</>PipenSetDataError
+
+— When trying to set input data to processes with input_data already set using Pipen.set_data().
+</>ProcInputTypeError
+
+— When an unsupported input type is provided</>ProcInputKeyError
+
+— When an unsupported input key is provided</>ProcInputValueError
+
+— When an unsupported input value is provided</>ProcScriptFileNotFound
+
+— When script file specified as 'file://' cannot be found</>ProcOutputNameError
+
+— When no name or malformatted output is provided</>ProcOutputTypeError
+
+— When an unsupported output type is provided</>ProcOutputValueError
+
+— When a malformatted output value is provided</>ProcDependencyError
+
+— When there is something wrong with the process dependencies</>NoSuchSchedulerError
+
+— When specified scheduler cannot be found</>WrongSchedulerTypeError
+
+— When specified scheduler is not a subclass of Scheduler</>NoSuchTemplateEngineError
+
+— When specified template engine cannot be found</>WrongTemplateEnginTypeError
+
+— When the specified template engine is not a subclass of Template</>TemplateRenderingError
+
+— Failed to render a template</>ConfigurationError
+
+— When something wrong is set as configuration</>PipenOrProcNameError
+
+— "When more than one processes are sharing the same workdir</>pipen.exceptions.
PipenException
(
)
Base exception class for pipen
pipen.exceptions.
PipenSetDataError
(
)
When trying to set input data to processes with input_data already set using Pipen.set_data().
+pipen.exceptions.
ProcInputTypeError
(
)
When an unsupported input type is provided
pipen.exceptions.
ProcInputKeyError
(
)
When an unsupported input key is provided
pipen.exceptions.
ProcInputValueError
(
)
When an unsupported input value is provided
pipen.exceptions.
ProcScriptFileNotFound
(
)
When script file specified as 'file://' cannot be found
pipen.exceptions.
ProcOutputNameError
(
)
When no name or malformatted output is provided
pipen.exceptions.
ProcOutputTypeError
(
)
When an unsupported output type is provided
pipen.exceptions.
ProcOutputValueError
(
)
When a malformatted output value is provided
pipen.exceptions.
ProcDependencyError
(
)
When there is something wrong with the process dependencies
pipen.exceptions.
NoSuchSchedulerError
(
)
When specified scheduler cannot be found
pipen.exceptions.
WrongSchedulerTypeError
(
)
When specified scheduler is not a subclass of Scheduler
pipen.exceptions.
NoSuchTemplateEngineError
(
)
When specified template engine cannot be found
pipen.exceptions.
WrongTemplateEnginTypeError
(
)
When the specified template engine is not a subclass of Template
pipen.exceptions.
TemplateRenderingError
(
)
Failed to render a template
pipen.exceptions.
ConfigurationError
(
)
When something wrong is set as configuration
pipen.exceptions.
PipenOrProcNameError
(
)
"When more than one processes are sharing the same workdir
The job for pipen
CMD_WRAPPER_SHELL
+
+— The shell to run the wrapped scriptCMD_WRAPPER_TEMPLATE
+
+— The template for job wrapping_error_retry
+
+— Whether we should retry if error happened_num_retries
+
+— Total number of retries_rc
+
+— The return code of the job_status
+
+— The status of the job_wrapped_cmd
+
+— The wrapped cmd, used for job submissioncached
+
+— Check if a job is cached</>cmd
+
+— The commandhook_done
+
+— Mark whether the hooks have already been called. Since we don't have a trigger for job finished/failed, we do a polling on it. This
+is to avoid calling the hooks repeatedly
+index
+
+— The index of the jobjid
+
+— The jid of the job in scheduler systemjid
+(int | str | none)
+— Get the jid of the job in scheduler system</>jid_file
+(Path)
+— The jid file of the job</>metadir
+
+— The metadir of the jobrc
+(int)
+— The return code of the job</>rc_file
+(Path)
+— The rc file of the job</>retry_dir
+(Path)
+— The retry directory of the job</>script_file
+
+— Get the path to script file</>signature_file
+
+— Get the path to the signature file</>status
+(int)
+— Query the status of the jobstatus_file
+(Path)
+— The status file of the job</>stderr_file
+(Path)
+— The stderr file of the job</>stdout_file
+(Path)
+— The stdout file of the job</>strcmd
+(str)
+— Get the string representation of the command</>trial_count
+
+— The count for re-tries__repr__
(
)
+(str)
+— repr of the job</>cache
(
)
+
+— write signature to signature file</>clean
(
retry
)
+
+— Clean up the meta files</>log
(
level
, msg
, *args
, limit
, limit_indicator
, logger
)
+
+— Log message for the jobs</>prepare
(
proc
)
+
+— Prepare the job by given process</>shebang
(
scheduler
)
+(str)
+— The shebang of the wrapped script</>wrapped_script
(
scheduler
)
+(PathLike)
+— Get the wrapped script</>abc.
ABCMeta
(
name
, bases
, namespace
, **kwargs
)
Metaclass for defining Abstract Base Classes (ABCs).
Use this metaclass to create an ABC. An ABC can be subclassed +directly, and then acts as a mix-in class. You can also register +unrelated concrete classes (even built-in classes) and unrelated +ABCs as 'virtual subclasses' -- these and their descendants will +be considered subclasses of the registering ABC by the built-in +issubclass() function, but the registering ABC won't show up in +their MRO (Method Resolution Order) nor will method +implementations defined by the registering ABC be callable (not +even via super()).
+__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__repr__
(
)
→ strrepr of the job
shebang
(
scheduler
)
→ strThe shebang of the wrapped script
clean
(
retry=False
)
Clean up the meta files
retry
+(optional)
+— Whether clean it for retryingwrapped_script
(
scheduler
)
Get the wrapped script
scheduler
+(Scheduler)
+— The schedulerThe path of the wrapped script
log
(
level
, msg
, *args
, limit=3
, limit_indicator=True
, logger=<LoggerAdapter pipen.core (WARNING)>
)
Log message for the jobs
level
+(int | str)
+— The log level of the recordmsg
+(str)
+— The message to log*args
+
+— The arguments to format the messagelimit
+(int, optional)
+— limitation of the log (don't log for all jobs)limit_indicator
+(bool, optional)
+— Whether to show an indicator saying the loghas been limited (the level of the indicator will be DEBUG)
+logger
+(LoggerAdapter, optional)
+— The logger used to logprepare
(
proc
)
Prepare the job by given process
Primarily prepare the script, and provide cmd to the job for xqute +to wrap and run
+proc
+(Proc)
+— the process objectpipen.pipen.
Pipen
(
name=None
, desc=None
, outdir=None
, **kwargs
)
The Pipen class provides interface to assemble and run the pipeline
PIPELINE_COUNT
+
+— How many pipelines are loadedSETUP
+
+— Whether the one-time setup hook is called_kwargs
+
+— The extra configurations passed to overwrite the default onesconfig
+
+— The configurationsdesc
+
+— The description of the pipelinename
+
+— The name of the pipelineoutdir
+
+— The output directory of the resultspbar
+
+— The progress barprocs
+
+— The processesprofile
+
+— The profile of the configurations to run the pipelinestarts
+
+— The start processesworkdir
+
+— The workdir for the pipelinename
+(str | none, optional)
+— The name of the pipelinedesc
+(str | none, optional)
+— The description of the pipelineoutdir
+(str | os.pathlike, optional)
+— The output directory of the results**kwargs
+
+— Other configurations__init_subclass__
(
)
+
+— This method is called when a class is subclassed.</>async_run
(
profile
)
+(bool)
+— Run the processes one by one</>build_proc_relationships
(
)
+
+— Build the proc relationships for the pipeline</>run
(
profile
)
+(bool)
+— Run the pipeline with the given profile. This is just a sync wrapper for the async async_run
function using
+asyncio.run()
+</>set_data
(
*indata
)
+(Pipen)
+— Set the input_data for start processes</>set_starts
(
*procs
, clear
)
+
+— Set the starts</>__init_subclass__
(
)
This method is called when a class is subclassed.
The default implementation does nothing. It may be +overridden to extend subclasses.
+async_run
(
profile='default'
)
Run the processes one by one
profile
+(str, optional)
+— The default profile to use for the runTrue if the pipeline ends successfully else False
run
(
profile='default'
)
Run the pipeline with the given profile. This is just a sync wrapper for the async async_run
function using
+asyncio.run()
profile
+(str, optional)
+— The default profile to use for the runTrue if the pipeline ends successfully else False
set_data
(
*indata
)
Set the input_data for start processes
*indata
+(Any)
+— The input data for the start processesThe data will set for the processes in the order determined by
+set_starts()
.
+If a process has input_data set, an error will be raised.
+To use that input_data, set None here in the corresponding
+position for the process
+ProcInputDataError
+
+— When trying to set input data to processes with input_data already set
+self
to chain the operations
set_starts
(
*procs
, clear=True
)
Set the starts
*procs
+(Union)
+— The processes to set as starts of the pipeline.clear
+(bool, optional)
+— Whether to clear previously set startsProcDependencyError
+
+— When processes are set as starts repeatedlyself
to chain the operations
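A minimal sketch of chaining set_starts() and set_data() on a Pipen instance; P1 and P2 are hypothetical processes:

from pipen import Pipen, Proc


class P1(Proc):
    """Echo each input value (illustrative)"""
    input = "a"
    output = "out:var:{{in.a}}"
    script = "echo {{in.a}}"


class P2(Proc):
    """Runs after P1 (illustrative)"""
    requires = P1
    input = "b"
    output = "out:var:{{in.b}}"
    script = "echo {{in.b}}"


pipeline = Pipen(name="example").set_starts(P1).set_data([1, 2, 3])
# pipeline.run()  # returns True if the pipeline ends successfully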
build_proc_relationships
(
)
Build the proc relationships for the pipeline
pipen.pipen.
run
(
name
, starts
, data=None
, desc=None
, outdir=None
, profile='default'
, **kwargs
)
Shortcut to run a pipeline
name
+(str)
+— The name of the pipelinestarts
+(Union)
+— The start processesdata
+(Iterable, optional)
+— The input data for the start processesdesc
+(str, optional)
+— The description of the pipelineoutdir
+(str | os.pathlike, optional)
+— The output directory of the resultsprofile
+(str, optional)
+— The profile to use**kwargs
+
+— Other options passed to Pipen to create the pipelineTrue if the pipeline ends successfully else False
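A hedged sketch of the run() shortcut; the process is hypothetical, and wrapping the input in a one-item list (one entry per start process) is an assumption about how data maps onto set_data():

from pipen import Proc
from pipen.pipen import run


class Hello(Proc):
    """Illustrative start process"""
    input = "word"
    output = "out:var:{{in.word}}"
    script = "echo {{in.word}}"


ok = run("hello_pipeline", starts=Hello, data=[["hi", "there"]], profile="default")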
Define hooks specifications and provide plugin manager
PipenMainPlugin
+
+— The builtin core plugin, used to update the progress bar and cache the job
+</>XqutePipenPlugin
+
+— The plugin for xqute working as proxy for pipen plugin hooks</>clear_path
(
job
, path
, is_dir
)
+(bool)
+— Clear the path, either a file or a directory</>get_mtime
(
job
, path
, dirsig
)
+(float)
+— Get the mtime of a path, either a file or a directory</>norm_inpath
(
job
, inpath
, is_dir
)
+(str)
+— Normalize the input path</>norm_outpath
(
job
, outpath
, is_dir
)
+(str)
+— Normalize the output path</>on_complete
(
pipen
, succeeded
)
+
+— When the pipeline is completed.</>on_init
(
pipen
)
+
+— When the pipeline is initialized, and default configs are loaded</>on_job_cached
(
job
)
+
+— When a job is cached.</>on_job_failed
(
job
)
+
+— When a job is done but failed.</>on_job_init
(
job
)
+
+— When a job is initialized</>on_job_killed
(
job
)
+
+— When a job is killed</>on_job_killing
(
job
)
+(bool)
+— When a job is being killed.</>on_job_polling
(
job
)
+
+— When status of a job is being polled.</>on_job_queued
(
job
)
+
+— When a job is queued in xqute. Note it might not be queued yet in the scheduler system.
+</>on_job_started
(
job
)
+
+— When a job starts to run in the scheduler system.</>on_job_submitted
(
job
)
+
+— When a job is submitted in the scheduler system.</>on_job_submitting
(
job
)
+(bool)
+— When a job is submitting.</>on_job_succeeded
(
job
)
+
+— When a job completes successfully.</>on_jobcmd_end
(
job
)
+(str)
+— When the job command finishes and after the postscript is run</>on_jobcmd_init
(
job
)
+(str)
+— When the job command wrapper script is initialized before the prescript is run</>on_jobcmd_prep
(
job
)
+(str)
+— When the job command is right about to be run</>on_proc_create
(
proc
)
+
+— Called in the Proc constructor when a process is created.</>on_proc_done
(
proc
, succeeded
)
+
+— When a process is done</>on_proc_init
(
proc
)
+
+— Called when a process is initialized.</>on_proc_input_computed
(
proc
)
+
+— Called after process input data is computed.</>on_proc_script_computed
(
proc
)
+
+— Called after process script is computed.</>on_proc_shutdown
(
proc
, sig
)
+
+— When pipeline is shutting down, by Ctrl-c for example.</>on_proc_start
(
proc
)
+
+— When a process is starting</>on_setup
(
config
)
+
+— Setup for plugins, primarily used for plugins to set up some default configurations.
+</>on_start
(
pipen
)
+
+— Right before the pipeline starts running.</>output_exists
(
job
, path
, is_dir
)
+(bool)
+— Check if the output exists</>pipen.pluginmgr.
on_setup
(
config
)
Setup for plugins, primarily used for plugins to set up some default configurations.
+This is only called once for all pipelines.
+config
+(Dict)
+— The configuration dictionaryplugin options should be defined under "plugin_opts"
+One should define a configuration item either with a prefix as
+the identity for the plugin or a namespace inside the plugin config
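As an illustration of the note above, a plugin might seed its own default under plugin_opts in on_setup. The sketch assumes the plugin manager is exposed as pipen.plugin (a simplug instance) and that config behaves like a dict; the option name is illustrative, and registering the plugin is a separate step:

from pipen import plugin


@plugin.impl
def on_setup(config):
    # Namespace the default under plugin_opts, keyed by the plugin's identity
    config["plugin_opts"]["myplugin_verbose"] = False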
+pipen.pluginmgr.
on_init
(
pipen
)
When the pipeline is initialized, and default configs are loaded
pipen
+(Pipen)
+— The Pipen objectpipen.pluginmgr.
on_start
(
pipen
)
Right before the pipeline starts running.
Process relationships are inferred.
+pipen
+(Pipen)
+— The Pipen objectpipen.pluginmgr.
on_complete
(
pipen
, succeeded
)
When the pipeline is completed.
pipen
+(Pipen)
+— The Pipen objectsucceeded
+(bool)
+— Whether the pipeline has successfully completed.pipen.pluginmgr.
on_proc_create
(
proc
)
Called in the Proc constructor when a process is created.
Enables plugins to modify the default attributes of processes
+proc
+(Proc)
+— The Proc objectpipen.pluginmgr.
on_proc_init
(
proc
)
Called when a process is initialized.
Allows plugins to modify the process attributes after initialization, but +before the jobs are initialized.
+proc
+(Proc)
+— The Proc objectpipen.pluginmgr.
on_proc_input_computed
(
proc
)
Called after process input data is computed.
proc
+(Proc)
+— The Proc objectpipen.pluginmgr.
on_proc_script_computed
(
proc
)
Called after process script is computed.
The script is computed as a string that is about to be compiled into a template.
+proc
+(Proc)
+— The Proc objectpipen.pluginmgr.
on_proc_start
(
proc
)
When a process is starting
proc
+(Proc)
+— The processpipen.pluginmgr.
on_proc_shutdown
(
proc
, sig
)
When pipeline is shutting down, by Ctrl-c for example.
Return False to stop shutting down, but you have to shut it down
+by yourself, for example, proc.xqute.task.cancel()
Only the first return value will be used.
+sig
+(signal.Signals)
+— The signal. None
means a natural shutdownpipen
+
+— The xqute objectpipen.pluginmgr.
on_proc_done
(
proc
, succeeded
)
When a process is done
proc
+(Proc)
+— The processsucceeded
+(bool | str)
+— Whether the process succeeded or not. 'cached' if all jobs are cached.
+pipen.pluginmgr.
on_job_init
(
job
)
When a job is initialized
job
+(Job)
+— The jobpipen.pluginmgr.
on_job_queued
(
job
)
When a job is queued in xqute. Note it might not be queued yet in the scheduler system.
+job
+(Job)
+— The jobpipen.pluginmgr.
on_job_submitting
(
job
)
When a job is submitting.
The first plugin (based on priority) that has this hook return False will cancel the submission
+job
+(Job)
+— The jobFalse to cancel submission
pipen.pluginmgr.
on_job_submitted
(
job
)
When a job is submitted in the scheduler system.
job
+(Job)
+— The jobpipen.pluginmgr.
on_job_started
(
job
)
When a job starts to run in the scheduler system.
Note that the job might not be running yet in the scheduler system.
+job
+(Job)
+— The jobpipen.pluginmgr.
on_job_polling
(
job
)
When status of a job is being polled.
job
+(Job)
+— The jobpipen.pluginmgr.
on_job_killing
(
job
)
When a job is being killed.
The first plugin (based on priority) that has this hook return False will cancel the killing
+job
+(Job)
+— The jobFalse to cancel killing
pipen.pluginmgr.
on_job_killed
(
job
)
When a job is killed
job
+(Job)
+— The jobpipen.pluginmgr.
on_job_succeeded
(
job
)
When a job completes successfully.
job
+(Job)
+— The jobpipen.pluginmgr.
on_job_cached
(
job
)
When a job is cached.
job
+(Job)
+— The jobpipen.pluginmgr.
on_job_failed
(
job
)
When a job is done but failed.
job
+(Job)
+— The jobpipen.pluginmgr.
norm_inpath
(
job
, inpath
, is_dir
)
Normalize the input path
job
+(Job)
+— The jobinpath
+(str | os.pathlike)
+— The input pathis_dir
+(bool)
+— Whether the path is a directoryThe normalized path
pipen.pluginmgr.
norm_outpath
(
job
, outpath
, is_dir
)
Normalize the output path
job
+(Job)
+— The joboutpath
+(str)
+— The output pathis_dir
+(bool)
+— Whether the path is a directoryThe normalized path
pipen.pluginmgr.
get_mtime
(
job
, path
, dirsig
)
Get the mtime of a path, either a file or a directory
job
+(Job)
+— The jobpath
+(str | os.pathlike)
+— The path to get mtimedirsig
+(int)
+— The depth of the directory to check the last modification timeThe last modification time
pipen.pluginmgr.
clear_path
(
job
, path
, is_dir
)
Clear the path, either a file or a directory
job
+(Job)
+— The jobpath
+(str | os.pathlike)
+— The path to clearis_dir
+(bool)
+— Whether the path is a directoryWhether the path is cleared successfully
pipen.pluginmgr.
output_exists
(
job
, path
, is_dir
)
Check if the output exists
job
+(Job)
+— The jobpath
+(str)
+— The path to checkis_dir
+(bool)
+— Whether the path is a directoryWhether the output exists
pipen.pluginmgr.
on_jobcmd_init
(
job
)
When the job command wrapper script is initialized before the prescript is run
This should return a piece of bash code to be inserted in the wrapped job
+script (template), which is a python template string, with the following
+variables available: status
and job
. status
is the class JobStatus
from
+xqute.defaults.py
and job
is the Job
instance.
For multiple plugins, the code will be inserted in the order of the plugin priority.
+The code will replace the #![jobcmd_init]
placeholder in the wrapped job script.
+See also https://github.com/pwwang/xqute/blob/master/xqute/defaults.py#L95
job
+(Job)
+— The job objectThe bash code to be inserted
pipen.pluginmgr.
on_jobcmd_prep
(
job
)
When the job command is right about to be run
This should return a piece of bash code to be inserted in the wrapped job
+script (template), which is a python template string, with the following
+variables available: status
and job
. status
is the class JobStatus
from
+xqute.defaults.py
and job
is the Job
instance.
The bash variable $cmd
is accessible in the context. It is also possible to
+modify the cmd
variable. Just remember to assign the modified value to cmd
.
For multiple plugins, the code will be inserted in the order of the plugin priority.
+Keep in mind that the $cmd
may be modified by other plugins.
The code will replace the #![jobcmd_prep]
placeholder in the wrapped job script.
+See also https://github.com/pwwang/xqute/blob/master/xqute/defaults.py#L95
job
+(Job)
+— The job objectThe bash code to be inserted
pipen.pluginmgr.
on_jobcmd_end
(
job
)
When the job command finishes and after the postscript is run
This should return a piece of bash code to be inserted in the wrapped job
+script (template), which is a python template string, with the following
+variables available: status
and job
. status
is the class JobStatus
from
+xqute.defaults.py
and job
is the Job
instance.
The bash variable $rc
is accessible in the context, which is the return code
+of the job command.
For multiple plugins, the code will be inserted in the order of the plugin priority.
+The code will replace the #![jobcmd_end]
placeholder in the wrapped job script.
+See also https://github.com/pwwang/xqute/blob/master/xqute/defaults.py#L95
job
+(Job)
+— The job objectThe bash code to be inserted
pipen.pluginmgr.
PipenMainPlugin
(
)
The builtin core plugin, used to update the progress bar and cache the job
+pipen.pluginmgr.
XqutePipenPlugin
(
)
The plugin for xqute working as proxy for pipen plugin hooks
pipen.proc.
ProcMeta
(
name
, bases
, namespace
, **kwargs
)
Meta class for Proc
__call__
(
cls
, *args
, **kwds
)
+(Proc)
+— Make sure Proc subclasses are singletons</>__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__repr__
(
cls
)
+(str)
+— Representation for the Proc subclasses</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__repr__
(
cls
)
→ strRepresentation for the Proc subclasses
pipen.proc.
Proc
(
*args
, **kwds
)
→ ProcThe abstract class for processes.
It's an abstract class. You can't instantiate a process using it directly. You have to subclass it. The subclass itself can be used as a process directly.
+Each subclass is a singleton, so to instantiate a new process, subclass
+an existing Proc
subclass, or use Proc.from_proc()
.
Never use the constructor directly. The Proc class is designed as a singleton and is instantiated internally.
+cache
+
+— Should we detect whether the jobs are cached?desc
+
+— The description of the process. Will use the summary from the docstring by default.
+dirsig
+
+— When checking the signature for caching, whether we should walk through the content of the directory? This is sometimes
+time-consuming if the directory is big.
+envs
+
+— The arguments that are job-independent, useful for common options across jobs.
+envs_depth
+
+— How deep to update the envs when subclassed.error_strategy
+
+— How to deal with the errorsexport
+
+— When True, the results will be exported to <pipeline.outdir>
Defaults to None, meaning only end processes will export.
+You can set it to True/False to enable or disable exporting
+for processes
+forks
+
+— How many jobs to run simultaneously?input
+
+— The keys for the input channelinput_data
+
+— The input data (will be computed for dependent processes)lang
+
+— The language for the script to run. Should be the path to the interpreter if lang
is not in $PATH
.
+name
+
+— The name of the process. Will use the class name by default.nexts
+
+— Computed from requires
to build the process relationshipsnum_retries
+
+— How many times to retry to jobs once error occursorder
+
+— The execution order for this process. The bigger the number is, the later the process will be executed. Default: 0.
+Note that the dependent processes will always be executed first.
+This doesn't work for start processes either, whose orders are
+determined by Pipen.set_starts()
+output
+
+— The output keys for the output channel (the data will be computed)
+output_data
+
+— The output data (to pass to the next processes)plugin_opts
+
+— Options for process-level pluginsrequires
+
+— The dependency processesscheduler
+
+— The scheduler to run the jobsscheduler_opts
+
+— The options for the schedulerscript
+
+— The script template for the processsubmission_batch
+
+— How many jobs to be submitted simultaneouslytemplate
+
+— Define the template engine to use.This could be either a template engine or a dict with key engine
+indicating the template engine and the rest the arguments passed
+to the constructor of the pipen.template.Template
object.
+The template engine could be either the name of the engine,
+currently jinja2 and liquidpy are supported, or a subclass of
+pipen.template.Template
.
+You can subclass pipen.template.Template
to use your own template
+engine.
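A minimal sketch pulling the attributes above together; the process name, envs and the head command are illustrative:

from pipen import Proc


class Head(Proc):
    """Keep the first N lines of the input file (illustrative)"""
    input = "infile:file"
    output = "outfile:file:head.txt"
    envs = {"n": 10}
    script = "head -n {{envs.n}} {{in.infile}} > {{out.outfile}}"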
+__init_subclass__
(
)
+
+— Do the requirements inferring since we need them to build up the process relationship
+</>from_proc
(
proc
, name
, desc
, envs
, envs_depth
, cache
, export
, error_strategy
, num_retries
, forks
, input_data
, order
, plugin_opts
, requires
, scheduler
, scheduler_opts
, submission_batch
)
+(Type)
+— Create a subclass of Proc using another Proc subclass or Proc itself</>gc
(
)
+
+— GC process for the process to save memory after it's done</>init
(
)
+
+— Init all other properties and jobs</>log
(
level
, msg
, *args
, logger
)
+
+— Log message for the process</>run
(
)
+
+— Run the process</>pipen.proc.
ProcMeta
(
name
, bases
, namespace
, **kwargs
)
Meta class for Proc
__call__
(
cls
, *args
, **kwds
)
+(Proc)
+— Make sure Proc subclasses are singletons</>__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__repr__
(
cls
)
+(str)
+— Representation for the Proc subclasses</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__repr__
(
cls
)
→ str — Representation for the Proc subclasses
from_proc
(
proc
, name=None
, desc=None
, envs=None
, envs_depth=None
, cache=None
, export=None
, error_strategy=None
, num_retries=None
, forks=None
, input_data=None
, order=None
, plugin_opts=None
, requires=None
, scheduler=None
, scheduler_opts=None
, submission_batch=None
)
Create a subclass of Proc using another Proc subclass or Proc itself
proc (Type) — The Proc subclass
+name (str, optional) — The new name of the process
+desc (str, optional) — The new description of the process
+envs (Mapping, optional) — The arguments of the process, will overwrite the parent's; items not specified here are inherited from the parent
+envs_depth (int, optional) — How deep to update the envs when subclassed
+cache (bool, optional) — Whether we should check the cache for the jobs
+export (bool, optional) — When True, the results will be exported to <pipeline.outdir>. Defaults to None, meaning only end processes will export. You can set it to True/False to enable or disable exporting for processes.
+error_strategy (str, optional) — How to deal with the errors
+num_retries (int, optional) — How many times to retry the jobs once an error occurs
+forks (int, optional) — New forks for the new process
+input_data (Any, optional) — The input data for the process. Only effective when this process is a start process.
+order (int, optional) — The order to execute the new process
+plugin_opts (Mapping, optional) — The new plugin options; unspecified items will be inherited
+requires (Sequence, optional) — The required processes for the new process
+scheduler (str, optional) — The new scheduler to run the new process
+scheduler_opts (Mapping, optional) — The new scheduler options; unspecified items will be inherited
+submission_batch (int, optional) — How many jobs to be submitted simultaneously
+Returns: The new process class
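A hedged sketch of Proc.from_proc(), deriving a new process from an existing one with a few overridden options (Subset here stands for any existing Proc subclass; the option values are arbitrary):
from pipen import Proc

Subset10 = Proc.from_proc(
    Subset,                                   # hypothetical existing Proc subclass
    name="Subset10",
    desc="Same script, different settings",
    forks=4,
    envs={"nlines": 10},                      # specified items overwrite the parent's
)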
__init_subclass__
(
)
Do the requirements inferring since we need them to build up the process relationship
+init
(
)
Init all other properties and jobs
gc
(
)
GC process for the process to save memory after it's done
log
(
level
, msg
, *args
, logger=<LoggerAdapter pipen.core (WARNING)>
)
Log message for the process
level
+(int | str)
+— The log level of the recordmsg
+(str)
+— The message to log*args
+
+— The arguments to format the messagelogger
+(LoggerAdapter, optional)
+— The logging loggerrun
(
)
Run the process
Process group that contains a set of processes.
It can be easily used to create a pipeline that runs independently or +is integrated into a larger pipeline.
+Runs directly: +
>>> proc_group = ProcGroup(<options>)
+>>> proc_group.as_pipen(<pipeline options>).set_data(<data>).run()
+
Integrated into a larger pipeline +
>>> proc_group = ProcGroup(<options>)
+>>> # proc could be a process within the larger pipeline
+>>> proc.requires = proc_group.<proc>
+
To add a process to the proc group, use the add_proc
method:
+
>>> class MyProcGroup(ProcGroup):
+>>> ...
+>>>
+>>> proc_group = MyProcGroup(...)
+>>> @proc_group.add_proc
+>>> class MyProc(Proc):
+>>> ...
+
Or add a process at runtime: +
>>> class MyProcGroup(ProcGroup):
+>>> ...
+>>>
+>>> @ProcGroup.add_proc
+>>> def my_proc(self):
+>>> class MyProc(Proc):
+>>> # You may use self.options here
+>>> ...
+>>> return MyProc
+>>> proc_group = MyProcGroup(...)
+
ProcGropuMeta
+
+— Meta class for ProcGroup</>ProcGroup
+
+— A group of processes that can be run independently or integrated into a larger pipeline.
+</>pipen.procgroup.
ProcGropuMeta
(
name
, bases
, namespace
, **kwargs
)
Meta class for ProcGroup
__call__
(
cls
, *args
, **kwds
)
+
+— Make sure Proc subclasses are singletons</>__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__call__
(
cls
, *args
, **kwds
)
Make sure Proc subclasses are singletons
*args
+
+— and**kwds
+
+— Arguments for the constructorThe Proc instance
pipen.procgroup.
ProcGroup
(
*args
, **kwds
)
A group of processes that can be run independently or integrated into a larger pipeline.
+ProcGropuMeta
+
+— Meta class for ProcGroup</>pipen.procgroup.
ProcGropuMeta
(
name
, bases
, namespace
, **kwargs
)
Meta class for ProcGroup
__call__
(
cls
, *args
, **kwds
)
+
+— Make sure Proc subclasses are singletons</>__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__call__
(
cls
, *args
, **kwds
)
Make sure Proc subclasses are singletons
*args
+
+— and**kwds
+
+— Arguments for the constructorThe Proc instance
__init_subclass__
(
)
This method is called when a class is subclassed.
The default implementation does nothing. It may be +overridden to extend subclasses.
+add_proc
(
self_or_method
, proc=None
)
Add a process to the proc group
It works either as a decorator to the process directly or as a +decorator to a method that returns the process.
+self_or_method
+(Union)
+— The proc group instance or a method that returns the process
+proc
+(Optional, optional)
+— The process class if self_or_method
is the proc group. Returns: The process class if self_or_method
is the proc group, or a cached property that returns the process class
as_pipen
(
name=None
, desc=None
, outdir=None
, **kwargs
)
Convert the process group to a Pipen instance
name
+(str | none, optional)
+— The name of the pipelinedesc
+(str | none, optional)
+— The description of the pipelineoutdir
+(str | os.pathlike | none, optional)
+— The output directory of the pipeline
+**kwargs
+— The keyword arguments to pass to Pipen
+Returns: The Pipen instance
Provide the PipelinePBar and ProcPBar classes
ProcPBar
+
+— The progress bar for processes</>PipelinePBar
+
+— Progress bar for the pipeline</>pipen.progressbar.
ProcPBar
(
manager
, proc_size
, proc_name
)
The progress bar for processes
done
(
)
+
+— The process is done</>update_job_failed
(
)
+
+— Update the progress bar when a job is failed</>update_job_retrying
(
)
+
+— Update the progress bar when a job is retrying</>update_job_running
(
)
+
+— Update the progress bar when a job is running</>update_job_submitted
(
)
+
+— Update the progress bar when a job is submitted</>update_job_succeeded
(
)
+
+— Update the progress bar when a job is succeeded</>update_job_submitted
(
)
Update the progress bar when a job is submitted
update_job_retrying
(
)
Update the progress bar when a job is retrying
update_job_running
(
)
Update the progress bar when a job is running
update_job_succeeded
(
)
Update the progress bar when a job is succeeded
update_job_failed
(
)
Update the progress bar when a job is failed
done
(
)
The process is done
pipen.progressbar.
PipelinePBar
(
n_procs
, ppln_name
)
Progress bar for the pipeline
done
(
)
+
+— When the pipeline is done</>proc_bar
(
proc_size
, proc_name
)
+(ProcPBar)
+— Get the progress bar for a process</>update_proc_done
(
)
+
+— Update the progress bar when a process is done</>update_proc_error
(
)
+
+— Update the progress bar when a process is errored</>update_proc_running
(
)
+
+— Update the progress bar when a process is running</>update_proc_running
(
)
Update the progress bar when a process is running
update_proc_done
(
)
Update the progress bar when a process is done
update_proc_error
(
)
Update the progress bar when a process is errored
done
(
)
When the pipeline is done
Provide builtin schedulers
LocalJob
+
+— Job class for local scheduler</>LocalScheduler
+
+— Local scheduler</>SgeJob
+
+— Job class for SGE scheduler</>SgeScheduler
+
+— SGE scheduler</>SlurmJob
+
+— Job class for Slurm scheduler</>SlurmScheduler
+
+— Slurm scheduler</>SshJob
+
+— Job class for SSH scheduler</>SshScheduler
+
+— SSH scheduler</>get_scheduler
(
scheduler
)
+(Type)
+— Get the scheduler by name or the scheduler class itself</>pipen.scheduler.
LocalJob
(
*args
, **kwargs
)
Job class for local scheduler
CMD_WRAPPER_SHELL
+
+— The shell to run the wrapped scriptCMD_WRAPPER_TEMPLATE
+
+— The template for job wrapping_error_retry
+
+— Whether we should retry if error happened_num_retries
+
+— Total number of retries_rc
+
+— The return code of the job_status
+
+— The status of the job_wrapped_cmd
+
+— The wrapped cmd, used for job submissioncached
+
+— Check if a job is cached</>cmd
+
+— The commandhook_done
+
+— Mark whether hooks have already been called. Since we don't have a trigger for job finished/failed, we do polling on it. This
+is to avoid calling the hooks repeatedly
+index
+
+— The index of the jobjid
+
+— The jid of the job in scheduler systemjid
+(int | str | none)
+— Get the jid of the job in scheduler system</>jid_file
+(Path)
+— The jid file of the job</>metadir
+
+— The metadir of the jobrc
+(int)
+— The return code of the job</>rc_file
+(Path)
+— The rc file of the job</>retry_dir
+(Path)
+— The retry directory of the job</>script_file
+
+— Get the path to script file</>signature_file
+
+— Get the path to the signature file</>status
+(int)
+— Query the status of the jobstatus_file
+(Path)
+— The status file of the job</>stderr_file
+(Path)
+— The stderr file of the job</>stdout_file
+(Path)
+— The stdout file of the job</>strcmd
+(str)
+— Get the string representation of the command</>trial_count
+
+— The count for re-tries__repr__
(
)
+(str)
+— repr of the job</>cache
(
)
+
+— write signature to signature file</>clean
(
retry
)
+
+— Clean up the meta files</>log
(
level
, msg
, *args
, limit
, limit_indicator
, logger
)
+
+— Log message for the jobs</>prepare
(
proc
)
+
+— Prepare the job by given process</>shebang
(
scheduler
)
+(str)
+— The shebang of the wrapped script</>wrapped_script
(
scheduler
)
+(PathLike)
+— Get the wrapped script</>abc.
ABCMeta
(
name
, bases
, namespace
, **kwargs
)
Metaclass for defining Abstract Base Classes (ABCs).
Use this metaclass to create an ABC. An ABC can be subclassed +directly, and then acts as a mix-in class. You can also register +unrelated concrete classes (even built-in classes) and unrelated +ABCs as 'virtual subclasses' -- these and their descendants will +be considered subclasses of the registering ABC by the built-in +issubclass() function, but the registering ABC won't show up in +their MRO (Method Resolution Order) nor will method +implementations defined by the registering ABC be callable (not +even via super()).
+__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__repr__
(
)
→ str — repr of the job
shebang
(
scheduler
)
→ str — The shebang of the wrapped script
clean
(
retry=False
)
Clean up the meta files
retry
+(optional)
+— Whether clean it for retryingwrapped_script
(
scheduler
)
Get the wrapped script
scheduler
+(Scheduler)
+— The schedulerThe path of the wrapped script
log
(
level
, msg
, *args
, limit=3
, limit_indicator=True
, logger=<LoggerAdapter pipen.core (WARNING)>
)
Log message for the jobs
level
+(int | str)
+— The log level of the recordmsg
+(str)
+— The message to log*args
+
+— The arguments to format the messagelimit
+(int, optional)
+— limitation of the log (don't log for all jobs)limit_indicator
+(bool, optional)
+— Whether to show an indicator saying the loghas been limited (the level of the indicator will be DEBUG)
+logger
+(LoggerAdapter, optional)
+— The logger used to logpipen.scheduler.
LocalScheduler
(
forks
, prescript=''
, postscript=''
, **kwargs
)
Local scheduler
forks
+(int)
+— Max number of job forksprescript
+(str, optional)
+— The script to run before the commandpostscript
+(str, optional)
+— The script to run after the command**kwargs
+
+— Other arguments for the schedulerjob_class
+
+— The job classname
+
+— The name of the schedulerjob_is_running
(
job
)
+(bool)
+— Tell if a job is really running, not only the job.jid_file</>job_is_submitted_or_running
(
job
)
+(bool)
+— Check if a job is already submitted or running</>kill_job
(
job
)
+
+— Kill a job asynchronously</>kill_job_and_update_status
(
job
)
+
+— Kill a job and update its status</>kill_running_jobs
(
jobs
)
+
+— Try to kill all running jobs</>polling_jobs
(
jobs
, on
, halt_on_error
)
+(bool)
+— Check if all jobs are done or new jobs can submit</>retry_job
(
job
)
+
+— Retry a job</>submit_job
(
job
)
+(int)
+— Submit a job locally</>submit_job_and_update_status
(
job
)
+
+— Submit and update the status</>submit_job_and_update_status
(
job
)
Submit and update the status
job
+(Job)
+— The jobretry_job
(
job
)
Retry a job
job
+(Job)
+— The jobkill_job_and_update_status
(
job
)
Kill a job and update its status
job
+(Job)
+— The jobpolling_jobs
(
jobs
, on
, halt_on_error
)
Check if all jobs are done or new jobs can submit
jobs
+(List)
+— The list of jobson
+(str)
+— query on status: can_submit
or all_done
halt_on_error
+(bool)
+— Whether we should halt the whole pipeline on errorTrue if yes otherwise False.
kill_running_jobs
(
jobs
)
Try to kill all running jobs
jobs
+(List)
+— The list of jobsjob_is_submitted_or_running
(
job
)
Check if a job is already submitted or running
job
+(Job)
+— The jobTrue if yes otherwise False.
submit_job
(
job
)
Submit a job locally
job
+(Job)
+— The jobThe process id
kill_job
(
job
)
Kill a job asynchronously
job
+(Job)
+— The jobjob_is_running
(
job
)
Tell if a job is really running, not only that the job.jid_file exists
This covers the case where the jid file is not cleaned up when the job is done.
+job
+(Job)
+— The jobTrue if it is, otherwise False
pipen.scheduler.
LocalJob
(
*args
, **kwargs
)
Job class for local scheduler
cached
+
+— Check if a job is cached</>jid
+(int | str | none)
+— Get the jid of the job in scheduler system</>jid_file
+(Path)
+— The jid file of the job</>rc
+(int)
+— The return code of the job</>rc_file
+(Path)
+— The rc file of the job</>retry_dir
+(Path)
+— The retry directory of the job</>script_file
+
+— Get the path to script file</>signature_file
+
+— Get the path to the signature file</>status
+(int)
+— Query the status of the jobstatus_file
+(Path)
+— The status file of the job</>stderr_file
+(Path)
+— The stderr file of the job</>stdout_file
+(Path)
+— The stdout file of the job</>strcmd
+(str)
+— Get the string representation of the command</>__repr__
(
)
+(str)
+— repr of the job</>cache
(
)
+
+— write signature to signature file</>clean
(
retry
)
+
+— Clean up the meta files</>log
(
level
, msg
, *args
, limit
, limit_indicator
, logger
)
+
+— Log message for the jobs</>prepare
(
proc
)
+
+— Prepare the job by given process</>shebang
(
scheduler
)
+(str)
+— The shebang of the wrapped script</>wrapped_script
(
scheduler
)
+(PathLike)
+— Get the wrapped script</>abc.
ABCMeta
(
name
, bases
, namespace
, **kwargs
)
Metaclass for defining Abstract Base Classes (ABCs).
Use this metaclass to create an ABC. An ABC can be subclassed +directly, and then acts as a mix-in class. You can also register +unrelated concrete classes (even built-in classes) and unrelated +ABCs as 'virtual subclasses' -- these and their descendants will +be considered subclasses of the registering ABC by the built-in +issubclass() function, but the registering ABC won't show up in +their MRO (Method Resolution Order) nor will method +implementations defined by the registering ABC be callable (not +even via super()).
+__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__repr__
(
)
→ str — repr of the job
shebang
(
scheduler
)
→ str — The shebang of the wrapped script
clean
(
retry=False
)
Clean up the meta files
retry
+(optional)
+— Whether clean it for retryingwrapped_script
(
scheduler
)
Get the wrapped script
scheduler
+(Scheduler)
+— The schedulerThe path of the wrapped script
Log message for the jobs
level
+(int | str)
+— The log level of the recordmsg
+(str)
+— The message to log*args
+
+— The arguments to format the messagelimit
+(int, optional)
+— limitation of the log (don't log for all jobs)limit_indicator
+(bool, optional)
+— Whether to show an indicator saying the loghas been limited (the level of the indicator will be DEBUG)
+logger
+(LoggerAdapter, optional)
+— The logger used to logpipen.scheduler.
SgeJob
(
*args
, **kwargs
)
Job class for SGE scheduler
CMD_WRAPPER_SHELL
+
+— The shell to run the wrapped scriptCMD_WRAPPER_TEMPLATE
+
+— The template for job wrapping_error_retry
+
+— Whether we should retry if error happened_num_retries
+
+— Total number of retries_rc
+
+— The return code of the job_status
+
+— The status of the job_wrapped_cmd
+
+— The wrapped cmd, used for job submissioncached
+
+— Check if a job is cached</>cmd
+
+— The commandhook_done
+
+— Mark whether hooks have already been called. Since we don't have a trigger for job finished/failed, we do polling on it. This
+is to avoid calling the hooks repeatedly
+index
+
+— The index of the jobjid
+
+— The jid of the job in scheduler systemjid
+(int | str | none)
+— Get the jid of the job in scheduler system</>jid_file
+(Path)
+— The jid file of the job</>metadir
+
+— The metadir of the jobrc
+(int)
+— The return code of the job</>rc_file
+(Path)
+— The rc file of the job</>retry_dir
+(Path)
+— The retry directory of the job</>script_file
+
+— Get the path to script file</>signature_file
+
+— Get the path to the signature file</>status
+(int)
+— Query the status of the jobstatus_file
+(Path)
+— The status file of the job</>stderr_file
+(Path)
+— The stderr file of the job</>stdout_file
+(Path)
+— The stdout file of the job</>strcmd
+(str)
+— Get the string representation of the command</>trial_count
+
+— The count for re-tries__repr__
(
)
+(str)
+— repr of the job</>cache
(
)
+
+— write signature to signature file</>clean
(
retry
)
+
+— Clean up the meta files</>log
(
level
, msg
, *args
, limit
, limit_indicator
, logger
)
+
+— Log message for the jobs</>prepare
(
proc
)
+
+— Prepare the job by given process</>shebang
(
scheduler
)
+(str)
+— Make the shebang with options</>wrapped_script
(
scheduler
)
+(PathLike)
+— Get the wrapped script</>abc.
ABCMeta
(
name
, bases
, namespace
, **kwargs
)
Metaclass for defining Abstract Base Classes (ABCs).
Use this metaclass to create an ABC. An ABC can be subclassed +directly, and then acts as a mix-in class. You can also register +unrelated concrete classes (even built-in classes) and unrelated +ABCs as 'virtual subclasses' -- these and their descendants will +be considered subclasses of the registering ABC by the built-in +issubclass() function, but the registering ABC won't show up in +their MRO (Method Resolution Order) nor will method +implementations defined by the registering ABC be callable (not +even via super()).
+__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__repr__
(
)
→ str — repr of the job
clean
(
retry=False
)
Clean up the meta files
retry
+(optional)
+— Whether clean it for retryingwrapped_script
(
scheduler
)
Get the wrapped script
scheduler
+(Scheduler)
+— The schedulerThe path of the wrapped script
log
(
level
, msg
, *args
, limit=3
, limit_indicator=True
, logger=<LoggerAdapter pipen.core (WARNING)>
)
Log message for the jobs
level
+(int | str)
+— The log level of the recordmsg
+(str)
+— The message to log*args
+
+— The arguments to format the messagelimit
+(int, optional)
+— limitation of the log (don't log for all jobs)limit_indicator
+(bool, optional)
+— Whether to show an indicator saying the loghas been limited (the level of the indicator will be DEBUG)
+logger
+(LoggerAdapter, optional)
+— The logger used to logshebang
(
scheduler
)
Make the shebang with options
scheduler
+(Scheduler)
+— The schedulerThe shebang with options
pipen.scheduler.
SgeScheduler
(
*args
, **kwargs
)
SGE scheduler
**kwargs
+
+— Other arguments for the schedulerjob_class
+
+— The job classname
+
+— The name of the schedulerjob_is_running
(
job
)
+(bool)
+— Tell if a job is really running, not only the job.jid_file</>job_is_submitted_or_running
(
job
)
+(bool)
+— Check if a job is already submitted or running</>kill_job
(
job
)
+
+— Kill a job on SGE</>kill_job_and_update_status
(
job
)
+
+— Kill a job and update its status</>kill_running_jobs
(
jobs
)
+
+— Try to kill all running jobs</>polling_jobs
(
jobs
, on
, halt_on_error
)
+(bool)
+— Check if all jobs are done or new jobs can submit</>retry_job
(
job
)
+
+— Retry a job</>submit_job
(
job
)
+(str)
+— Submit a job to SGE</>submit_job_and_update_status
(
job
)
+
+— Submit and update the status</>submit_job_and_update_status
(
job
)
Submit and update the status
job
+(Job)
+— The jobretry_job
(
job
)
Retry a job
job
+(Job)
+— The jobkill_job_and_update_status
(
job
)
Kill a job and update its status
job
+(Job)
+— The jobpolling_jobs
(
jobs
, on
, halt_on_error
)
Check if all jobs are done or new jobs can submit
jobs
+(List)
+— The list of jobson
+(str)
+— query on status: can_submit
or all_done
halt_on_error
+(bool)
+— Whether we should halt the whole pipeline on errorTrue if yes otherwise False.
kill_running_jobs
(
jobs
)
Try to kill all running jobs
jobs
+(List)
+— The list of jobsjob_is_submitted_or_running
(
job
)
Check if a job is already submitted or running
job
+(Job)
+— The jobTrue if yes otherwise False.
submit_job
(
job
)
Submit a job to SGE
job
+(Job)
+— The jobThe job id
kill_job
(
job
)
Kill a job on SGE
job
+(Job)
+— The jobjob_is_running
(
job
)
Tell if a job is really running, not only that the job.jid_file exists
This covers the case where the jid file is not cleaned up when the job is done.
+job
+(Job)
+— The jobTrue if it is, otherwise False
pipen.scheduler.
SgeJob
(
*args
, **kwargs
)
Job class for SGE scheduler
cached
+
+— Check if a job is cached</>jid
+(int | str | none)
+— Get the jid of the job in scheduler system</>jid_file
+(Path)
+— The jid file of the job</>rc
+(int)
+— The return code of the job</>rc_file
+(Path)
+— The rc file of the job</>retry_dir
+(Path)
+— The retry directory of the job</>script_file
+
+— Get the path to script file</>signature_file
+
+— Get the path to the signature file</>status
+(int)
+— Query the status of the jobstatus_file
+(Path)
+— The status file of the job</>stderr_file
+(Path)
+— The stderr file of the job</>stdout_file
+(Path)
+— The stdout file of the job</>strcmd
+(str)
+— Get the string representation of the command</>__repr__
(
)
+(str)
+— repr of the job</>cache
(
)
+
+— write signature to signature file</>clean
(
retry
)
+
+— Clean up the meta files</>log
(
level
, msg
, *args
, limit
, limit_indicator
, logger
)
+
+— Log message for the jobs</>prepare
(
proc
)
+
+— Prepare the job by given process</>shebang
(
scheduler
)
+(str)
+— Make the shebang with options</>wrapped_script
(
scheduler
)
+(PathLike)
+— Get the wrapped script</>abc.
ABCMeta
(
name
, bases
, namespace
, **kwargs
)
Metaclass for defining Abstract Base Classes (ABCs).
Use this metaclass to create an ABC. An ABC can be subclassed +directly, and then acts as a mix-in class. You can also register +unrelated concrete classes (even built-in classes) and unrelated +ABCs as 'virtual subclasses' -- these and their descendants will +be considered subclasses of the registering ABC by the built-in +issubclass() function, but the registering ABC won't show up in +their MRO (Method Resolution Order) nor will method +implementations defined by the registering ABC be callable (not +even via super()).
+__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__repr__
(
)
→ str — repr of the job
clean
(
retry=False
)
Clean up the meta files
retry
+(optional)
+— Whether clean it for retryingwrapped_script
(
scheduler
)
Get the wrapped script
scheduler
+(Scheduler)
+— The schedulerThe path of the wrapped script
Log message for the jobs
level
+(int | str)
+— The log level of the recordmsg
+(str)
+— The message to log*args
+
+— The arguments to format the messagelimit
+(int, optional)
+— limitation of the log (don't log for all jobs)limit_indicator
+(bool, optional)
+— Whether to show an indicator saying the loghas been limited (the level of the indicator will be DEBUG)
+logger
+(LoggerAdapter, optional)
+— The logger used to logshebang
(
scheduler
)
Make the shebang with options
scheduler
+(Scheduler)
+— The schedulerThe shebang with options
pipen.scheduler.
SlurmJob
(
*args
, **kwargs
)
Job class for Slurm scheduler
CMD_WRAPPER_SHELL
+
+— The shell to run the wrapped scriptCMD_WRAPPER_TEMPLATE
+
+— The template for job wrapping_error_retry
+
+— Whether we should retry if error happened_num_retries
+
+— Total number of retries_rc
+
+— The return code of the job_status
+
+— The status of the job_wrapped_cmd
+
+— The wrapped cmd, used for job submissioncached
+
+— Check if a job is cached</>cmd
+
+— The commandhook_done
+
+— Mark whether hooks have already been called. Since we don't have a trigger for job finished/failed, we do polling on it. This
+is to avoid calling the hooks repeatedly
+index
+
+— The index of the jobjid
+
+— The jid of the job in scheduler systemjid
+(int | str | none)
+— Get the jid of the job in scheduler system</>jid_file
+(Path)
+— The jid file of the job</>metadir
+
+— The metadir of the jobrc
+(int)
+— The return code of the job</>rc_file
+(Path)
+— The rc file of the job</>retry_dir
+(Path)
+— The retry directory of the job</>script_file
+
+— Get the path to script file</>signature_file
+
+— Get the path to the signature file</>status
+(int)
+— Query the status of the jobstatus_file
+(Path)
+— The status file of the job</>stderr_file
+(Path)
+— The stderr file of the job</>stdout_file
+(Path)
+— The stdout file of the job</>strcmd
+(str)
+— Get the string representation of the command</>trial_count
+
+— The count for re-tries__repr__
(
)
+(str)
+— repr of the job</>cache
(
)
+
+— write signature to signature file</>clean
(
retry
)
+
+— Clean up the meta files</>log
(
level
, msg
, *args
, limit
, limit_indicator
, logger
)
+
+— Log message for the jobs</>prepare
(
proc
)
+
+— Prepare the job by given process</>shebang
(
scheduler
)
+(str)
+— Make the shebang with options</>wrapped_script
(
scheduler
)
+(PathLike)
+— Get the wrapped script</>abc.
ABCMeta
(
name
, bases
, namespace
, **kwargs
)
Metaclass for defining Abstract Base Classes (ABCs).
Use this metaclass to create an ABC. An ABC can be subclassed +directly, and then acts as a mix-in class. You can also register +unrelated concrete classes (even built-in classes) and unrelated +ABCs as 'virtual subclasses' -- these and their descendants will +be considered subclasses of the registering ABC by the built-in +issubclass() function, but the registering ABC won't show up in +their MRO (Method Resolution Order) nor will method +implementations defined by the registering ABC be callable (not +even via super()).
+__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__repr__
(
)
→ str — repr of the job
clean
(
retry=False
)
Clean up the meta files
retry
+(optional)
+— Whether clean it for retryingwrapped_script
(
scheduler
)
Get the wrapped script
scheduler
+(Scheduler)
+— The schedulerThe path of the wrapped script
log
(
level
, msg
, *args
, limit=3
, limit_indicator=True
, logger=<LoggerAdapter pipen.core (WARNING)>
)
Log message for the jobs
level
+(int | str)
+— The log level of the recordmsg
+(str)
+— The message to log*args
+
+— The arguments to format the messagelimit
+(int, optional)
+— limitation of the log (don't log for all jobs)limit_indicator
+(bool, optional)
+— Whether to show an indicator saying the loghas been limited (the level of the indicator will be DEBUG)
+logger
+(LoggerAdapter, optional)
+— The logger used to logshebang
(
scheduler
)
Make the shebang with options
scheduler
+(Scheduler)
+— The schedulerThe shebang with options
pipen.scheduler.
SlurmScheduler
(
*args
, **kwargs
)
Slurm scheduler
**kwargs
+
+— Other arguments for the schedulerjob_class
+
+— The job classname
+
+— The name of the schedulerjob_is_running
(
job
)
+(bool)
+— Tell if a job is really running, not only the job.jid_file</>job_is_submitted_or_running
(
job
)
+(bool)
+— Check if a job is already submitted or running</>kill_job
(
job
)
+
+— Kill a job on Slurm</>kill_job_and_update_status
(
job
)
+
+— Kill a job and update its status</>kill_running_jobs
(
jobs
)
+
+— Try to kill all running jobs</>polling_jobs
(
jobs
, on
, halt_on_error
)
+(bool)
+— Check if all jobs are done or new jobs can submit</>retry_job
(
job
)
+
+— Retry a job</>submit_job
(
job
)
+(str)
+— Submit a job to Slurm</>submit_job_and_update_status
(
job
)
+
+— Submit and update the status</>submit_job_and_update_status
(
job
)
Submit and update the status
job
+(Job)
+— The jobretry_job
(
job
)
Retry a job
job
+(Job)
+— The jobkill_job_and_update_status
(
job
)
Kill a job and update its status
job
+(Job)
+— The jobpolling_jobs
(
jobs
, on
, halt_on_error
)
Check if all jobs are done or new jobs can submit
jobs
+(List)
+— The list of jobson
+(str)
+— query on status: can_submit
or all_done
halt_on_error
+(bool)
+— Whether we should halt the whole pipeline on errorTrue if yes otherwise False.
kill_running_jobs
(
jobs
)
Try to kill all running jobs
jobs
+(List)
+— The list of jobsjob_is_submitted_or_running
(
job
)
Check if a job is already submitted or running
job
+(Job)
+— The jobTrue if yes otherwise False.
submit_job
(
job
)
Submit a job to Slurm
job
+(Job)
+— The jobThe job id
kill_job
(
job
)
Kill a job on Slurm
job
+(Job)
+— The jobjob_is_running
(
job
)
Tell if a job is really running, not only that the job.jid_file exists
This covers the case where the jid file is not cleaned up when the job is done.
+job
+(Job)
+— The jobTrue if it is, otherwise False
pipen.scheduler.
SlurmJob
(
*args
, **kwargs
)
Job class for Slurm scheduler
cached
+
+— Check if a job is cached</>jid
+(int | str | none)
+— Get the jid of the job in scheduler system</>jid_file
+(Path)
+— The jid file of the job</>rc
+(int)
+— The return code of the job</>rc_file
+(Path)
+— The rc file of the job</>retry_dir
+(Path)
+— The retry directory of the job</>script_file
+
+— Get the path to script file</>signature_file
+
+— Get the path to the signature file</>status
+(int)
+— Query the status of the jobstatus_file
+(Path)
+— The status file of the job</>stderr_file
+(Path)
+— The stderr file of the job</>stdout_file
+(Path)
+— The stdout file of the job</>strcmd
+(str)
+— Get the string representation of the command</>__repr__
(
)
+(str)
+— repr of the job</>cache
(
)
+
+— write signature to signature file</>clean
(
retry
)
+
+— Clean up the meta files</>log
(
level
, msg
, *args
, limit
, limit_indicator
, logger
)
+
+— Log message for the jobs</>prepare
(
proc
)
+
+— Prepare the job by given process</>shebang
(
scheduler
)
+(str)
+— Make the shebang with options</>wrapped_script
(
scheduler
)
+(PathLike)
+— Get the wrapped script</>abc.
ABCMeta
(
name
, bases
, namespace
, **kwargs
)
Metaclass for defining Abstract Base Classes (ABCs).
Use this metaclass to create an ABC. An ABC can be subclassed +directly, and then acts as a mix-in class. You can also register +unrelated concrete classes (even built-in classes) and unrelated +ABCs as 'virtual subclasses' -- these and their descendants will +be considered subclasses of the registering ABC by the built-in +issubclass() function, but the registering ABC won't show up in +their MRO (Method Resolution Order) nor will method +implementations defined by the registering ABC be callable (not +even via super()).
+__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__repr__
(
)
→ str — repr of the job
clean
(
retry=False
)
Clean up the meta files
retry
+(optional)
+— Whether clean it for retryingwrapped_script
(
scheduler
)
Get the wrapped script
scheduler
+(Scheduler)
+— The schedulerThe path of the wrapped script
Log message for the jobs
level
+(int | str)
+— The log level of the recordmsg
+(str)
+— The message to log*args
+
+— The arguments to format the messagelimit
+(int, optional)
+— limitation of the log (don't log for all jobs)limit_indicator
+(bool, optional)
+— Whether to show an indicator saying the loghas been limited (the level of the indicator will be DEBUG)
+logger
+(LoggerAdapter, optional)
+— The logger used to logshebang
(
scheduler
)
Make the shebang with options
scheduler
+(Scheduler)
+— The schedulerThe shebang with options
pipen.scheduler.
SshJob
(
*args
, **kwargs
)
Job class for SSH scheduler
CMD_WRAPPER_SHELL
+
+— The shell to run the wrapped scriptCMD_WRAPPER_TEMPLATE
+
+— The template for job wrapping_error_retry
+
+— Whether we should retry if error happened_num_retries
+
+— Total number of retries_rc
+
+— The return code of the job_status
+
+— The status of the job_wrapped_cmd
+
+— The wrapped cmd, used for job submissioncached
+
+— Check if a job is cached</>cmd
+
+— The commandhook_done
+
+— Mark whether hooks have already been called. Since we don't have a trigger for job finished/failed, we do polling on it. This
+is to avoid calling the hooks repeatedly
+index
+
+— The index of the jobjid
+
+— The jid of the job in scheduler systemjid
+(int | str | none)
+— Get the jid of the job in scheduler system</>jid_file
+(Path)
+— The jid file of the job</>metadir
+
+— The metadir of the jobrc
+(int)
+— The return code of the job</>rc_file
+(Path)
+— The rc file of the job</>retry_dir
+(Path)
+— The retry directory of the job</>script_file
+
+— Get the path to script file</>signature_file
+
+— Get the path to the signature file</>status
+(int)
+— Query the status of the jobstatus_file
+(Path)
+— The status file of the job</>stderr_file
+(Path)
+— The stderr file of the job</>stdout_file
+(Path)
+— The stdout file of the job</>strcmd
+(str)
+— Get the string representation of the command</>trial_count
+
+— The count for re-tries__repr__
(
)
+(str)
+— repr of the job</>cache
(
)
+
+— write signature to signature file</>clean
(
retry
)
+
+— Clean up the meta files</>log
(
level
, msg
, *args
, limit
, limit_indicator
, logger
)
+
+— Log message for the jobs</>prepare
(
proc
)
+
+— Prepare the job by given process</>shebang
(
scheduler
)
+(str)
+— The shebang of the wrapped script</>wrapped_script
(
scheduler
)
+(PathLike)
+— Get the wrapped script</>abc.
ABCMeta
(
name
, bases
, namespace
, **kwargs
)
Metaclass for defining Abstract Base Classes (ABCs).
Use this metaclass to create an ABC. An ABC can be subclassed +directly, and then acts as a mix-in class. You can also register +unrelated concrete classes (even built-in classes) and unrelated +ABCs as 'virtual subclasses' -- these and their descendants will +be considered subclasses of the registering ABC by the built-in +issubclass() function, but the registering ABC won't show up in +their MRO (Method Resolution Order) nor will method +implementations defined by the registering ABC be callable (not +even via super()).
+__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__repr__
(
)
→ str — repr of the job
shebang
(
scheduler
)
→ str — The shebang of the wrapped script
clean
(
retry=False
)
Clean up the meta files
retry
+(optional)
+— Whether clean it for retryingwrapped_script
(
scheduler
)
Get the wrapped script
scheduler
+(Scheduler)
+— The schedulerThe path of the wrapped script
log
(
level
, msg
, *args
, limit=3
, limit_indicator=True
, logger=<LoggerAdapter pipen.core (WARNING)>
)
Log message for the jobs
level
+(int | str)
+— The log level of the recordmsg
+(str)
+— The message to log*args
+
+— The arguments to format the messagelimit
+(int, optional)
+— limitation of the log (don't log for all jobs)limit_indicator
+(bool, optional)
+— Whether to show an indicator saying the loghas been limited (the level of the indicator will be DEBUG)
+logger
+(LoggerAdapter, optional)
+— The logger used to logpipen.scheduler.
SshScheduler
(
forks
, prescript=''
, postscript=''
, **kwargs
)
SSH scheduler
forks
+(int)
+— Max number of job forksprescript
+(str, optional)
+— The script to run before the commandpostscript
+(str, optional)
+— The script to run after the command**kwargs
+
+— Other arguments for the schedulerjob_class
+
+— The job classname
+
+— The name of the schedulerjob_is_running
(
job
)
+(bool)
+— Tell if a job is really running, not only the job.jid_file</>job_is_submitted_or_running
(
job
)
+(bool)
+— Check if a job is already submitted or running</>kill_job
(
job
)
+
+— Kill a job on SSH</>kill_job_and_update_status
(
job
)
+
+— Kill a job and update its status</>kill_running_jobs
(
jobs
)
+
+— Try to kill all running jobs</>polling_jobs
(
jobs
, on
, halt_on_error
)
+(bool)
+— Check if all jobs are done or new jobs can submit</>retry_job
(
job
)
+
+— Retry a job</>submit_job
(
job
)
+(str)
+— Submit a job to SSH</>submit_job_and_update_status
(
job
)
+
+— Submit and update the status</>submit_job_and_update_status
(
job
)
Submit and update the status
job
+(Job)
+— The jobretry_job
(
job
)
Retry a job
job
+(Job)
+— The jobkill_job_and_update_status
(
job
)
Kill a job and update its status
job
+(Job)
+— The jobpolling_jobs
(
jobs
, on
, halt_on_error
)
Check if all jobs are done or new jobs can submit
jobs
+(List)
+— The list of jobson
+(str)
+— query on status: can_submit
or all_done
halt_on_error
+(bool)
+— Whether we should halt the whole pipeline on errorTrue if yes otherwise False.
kill_running_jobs
(
jobs
)
Try to kill all running jobs
jobs
+(List)
+— The list of jobsjob_is_submitted_or_running
(
job
)
Check if a job is already submitted or running
job
+(Job)
+— The jobTrue if yes otherwise False.
submit_job
(
job
)
Submit a job to SSH
job
+(Job)
+— The jobThe job id
kill_job
(
job
)
Kill a job on SSH
job
+(Job)
+— The jobjob_is_running
(
job
)
Tell if a job is really running, not only that the job.jid_file exists
This covers the case where the jid file is not cleaned up when the job is done.
+job
+(Job)
+— The jobTrue if it is, otherwise False
pipen.scheduler.
SshJob
(
*args
, **kwargs
)
Job class for SSH scheduler
cached
+
+— Check if a job is cached</>jid
+(int | str | none)
+— Get the jid of the job in scheduler system</>jid_file
+(Path)
+— The jid file of the job</>rc
+(int)
+— The return code of the job</>rc_file
+(Path)
+— The rc file of the job</>retry_dir
+(Path)
+— The retry directory of the job</>script_file
+
+— Get the path to script file</>signature_file
+
+— Get the path to the signature file</>status
+(int)
+— Query the status of the jobstatus_file
+(Path)
+— The status file of the job</>stderr_file
+(Path)
+— The stderr file of the job</>stdout_file
+(Path)
+— The stdout file of the job</>strcmd
+(str)
+— Get the string representation of the command</>__repr__
(
)
+(str)
+— repr of the job</>cache
(
)
+
+— write signature to signature file</>clean
(
retry
)
+
+— Clean up the meta files</>log
(
level
, msg
, *args
, limit
, limit_indicator
, logger
)
+
+— Log message for the jobs</>prepare
(
proc
)
+
+— Prepare the job by given process</>shebang
(
scheduler
)
+(str)
+— The shebang of the wrapped script</>wrapped_script
(
scheduler
)
+(PathLike)
+— Get the wrapped script</>abc.
ABCMeta
(
name
, bases
, namespace
, **kwargs
)
Metaclass for defining Abstract Base Classes (ABCs).
Use this metaclass to create an ABC. An ABC can be subclassed +directly, and then acts as a mix-in class. You can also register +unrelated concrete classes (even built-in classes) and unrelated +ABCs as 'virtual subclasses' -- these and their descendants will +be considered subclasses of the registering ABC by the built-in +issubclass() function, but the registering ABC won't show up in +their MRO (Method Resolution Order) nor will method +implementations defined by the registering ABC be callable (not +even via super()).
+__instancecheck__
(
cls
, instance
)
+
+— Override for isinstance(instance, cls).</>__subclasscheck__
(
cls
, subclass
)
+
+— Override for issubclass(subclass, cls).</>register
(
cls
, subclass
)
+
+— Register a virtual subclass of an ABC.</>register
(
cls
, subclass
)
Register a virtual subclass of an ABC.
Returns the subclass, to allow usage as a class decorator.
+__instancecheck__
(
cls
, instance
)
Override for isinstance(instance, cls).
__subclasscheck__
(
cls
, subclass
)
Override for issubclass(subclass, cls).
__repr__
(
)
→ str — repr of the job
shebang
(
scheduler
)
→ str — The shebang of the wrapped script
clean
(
retry=False
)
Clean up the meta files
retry
+(optional)
+— Whether clean it for retryingwrapped_script
(
scheduler
)
Get the wrapped script
scheduler
+(Scheduler)
+— The schedulerThe path of the wrapped script
Log message for the jobs
level
+(int | str)
+— The log level of the recordmsg
+(str)
+— The message to log*args
+
+— The arguments to format the messagelimit
+(int, optional)
+— limitation of the log (don't log for all jobs)limit_indicator
+(bool, optional)
+— Whether to show an indicator saying the loghas been limited (the level of the indicator will be DEBUG)
+logger
+(LoggerAdapter, optional)
+— The logger used to logpipen.scheduler.
get_scheduler
(
scheduler
)
Get the scheduler by name or the scheduler class itself
scheduler
+(Union)
+— The scheduler class or name
+Returns: The scheduler class
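A small sketch of get_scheduler(); it accepts either a registered name or a Scheduler class and returns the class (assuming "local" is the registered name of LocalScheduler):
from pipen.scheduler import get_scheduler, LocalScheduler

assert get_scheduler("local") is LocalScheduler          # by name (assumption)
assert get_scheduler(LocalScheduler) is LocalScheduler   # a Scheduler class is returned as-is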
Template adaptor for pipen
Template
(
source
, **kwargs
)
+
+— Base class wrapper to wrap template for pipen</>TemplateLiquid
+
+— Liquidpy template wrapper.</>TemplateJinja2
+
+— Jinja2 template wrapper</>get_template_engine
(
template
)
+(Type)
+— Get the template engine by name or the template engine itself</>pipen.template.
Template
(
source
, **kwargs
)
Base class wrapper to wrap template for pipen
render
(
data=None
)
→ str — Render the template. Params: data (dict) — The data used to render
+pipen.template.
TemplateLiquid
(
source
, **kwargs
)
Liquidpy template wrapper.
render
(
data=None
)
→ str — Render the template. Params: data (dict) — The data used to render
+pipen.template.
TemplateJinja2
(
source
, **kwargs
)
Jinja2 template wrapper
render
(
data=None
)
→ str — Render the template. Params: data (dict) — The data used to render
+pipen.template.
get_template_engine
(
template
)
Get the template engine by name or the template engine itself
template
+(Union)
+— The name of the template engine or the template engine itself
+Returns: The template engine
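A hedged sketch of getting a template engine by name and rendering with it, following the Template(source, **kwargs) and render(data) signatures above:
from pipen.template import get_template_engine

engine = get_template_engine("jinja2")        # or a pipen.template.Template subclass
tpl = engine("Hello {{ name }}!")             # Template(source, **kwargs)
print(tpl.render({"name": "pipen"}))          # -> Hello pipen!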
Provide some utilities
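Two of the utilities listed below, mark() and get_marked(), attach metadata to a process class and read it back; a small hedged sketch (the metadata keys are arbitrary):
from pipen import Proc
from pipen.utils import mark, get_marked

@mark(deprecated=True)                         # arbitrary metadata key
class OldProc(Proc):
    input = "infile:file"
    output = "outfile:file:out.txt"
    script = "cp {{in.infile}} {{out.outfile}}"

get_marked(OldProc, "deprecated", False)       # -> True
get_marked(OldProc, "owner", "unknown")        # -> "unknown" (the default)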
RichHandler
+
+— Subclass of rich.logging.RichHandler, showing log levels as a single character
+</>RichConsole
+
+— A high level console interface.</>brief_list
(
blist
)
+(str)
+— Briefly show an integer list, combine the continuous numbers.</>copy_dict
(
dic
, depth
)
+(Mapping)
+— Deep copy a dict</>desc_from_docstring
(
obj
, base
)
+(str)
+— Get the description from docstring</>get_base
(
klass
, abc_base
, value
, value_getter
)
+(Type)
+— Get the base class where the value was first defined</>get_logger
(
name
, level
)
+(LoggerAdapter)
+— Get the logger by given plugin name</>get_logpanel_width
(
)
+(int)
+— Get the width of the log content</>get_marked
(
cls
, mark_name
, default
)
+(Any)
+— Get the marked value from a proc</>get_mtime
(
path
, dir_depth
)
+(float)
+— Get the modification time of a path. If path is a directory, try to get the last modification time of the
+contents in the directory at given dir_depth
+</>get_shebang
(
script
)
+(str)
+— Get the shebang of the script</>ignore_firstline_dedent
(
text
)
+(str)
+— Like textwrap.dedent(), but ignore first empty lines</>is_loading_pipeline
(
*flags
, argv
)
+(bool)
+— Check if we are loading the pipeline. Works only when argv0
is "@pipen" while loading the pipeline.
+</>is_subclass
(
obj
, cls
)
+(bool)
+— Tell if obj is a subclass of cls. The difference from issubclass is that we don't raise a TypeError if obj
+is not a class
+</>is_valid_name
(
name
)
+(bool)
+— Check if a name is valid for a proc or pipen</>load_entrypoints
(
group
)
+(Iterable)
+— Load objects from setuptools entrypoints by given group name</>load_pipeline
(
obj
, argv0
, argv1p
, **kwargs
)
+(Pipen)
+— Load a pipeline from a Pipen, Proc or ProcGroup object</>log_rich_renderable
(
renderable
, color
, logfunc
, *args
, **kwargs
)
+
+— Log a rich renderable to logger</>make_df_colnames_unique_inplace
(
thedf
)
+
+— Make the columns of a data frame unique</>mark
(
**kwargs
)
+(Callable)
+— Mark a class (e.g. Proc) with given kwargs as metadata</>pipen_banner
(
)
+(RenderableType)
+— The banner for pipen</>strsplit
(
string
, sep
, maxsplit
, trim
)
+(List)
+— Split the string, with the ability to trim each part.</>truncate_text
(
text
, width
, end
)
+(str)
+— Truncate a text not based on words/whitespaces. Otherwise, we could use textwrap.shorten.
+</>update_dict
(
parent
, new
, depth
)
+(Mapping)
+— Update the new dict to the parent, but make sure parent does not change</>pipen.utils.
RichHandler
(
level=0
, console=None
, show_time=True
, omit_repeated_times=True
, show_level=True
, show_path=True
, enable_link_path=True
, highlighter=None
, markup=False
, rich_tracebacks=False
, tracebacks_width=None
, tracebacks_code_width=88
, tracebacks_extra_lines=3
, tracebacks_theme=None
, tracebacks_word_wrap=True
, tracebacks_show_locals=False
, tracebacks_suppress=()
, tracebacks_max_frames=100
, locals_max_length=10
, locals_max_string=80
, log_time_format='[%x %X]'
, keywords=None
)
Subclass of rich.logging.RichHandler, showing log levels as a single character
+level
+(Union, optional)
+— Log level. Defaults to logging.NOTSET.show_time
+(bool, optional)
+— Show a column for the time. Defaults to True.omit_repeated_times
+(bool, optional)
+— Omit repetition of the same time. Defaults to True.show_level
+(bool, optional)
+— Show a column for the level. Defaults to True.show_path
+(bool, optional)
+— Show the path to the original log call. Defaults to True.enable_link_path
+(bool, optional)
+— Enable terminal link of path column to file. Defaults to True.highlighter
+(Optional, optional)
+— Highlighter to style log messages, or None to use ReprHighlighter. Defaults to None.markup
+(bool, optional)
+— Enable console markup in log messages. Defaults to False.rich_tracebacks
+(bool, optional)
+— Enable rich tracebacks with syntax highlighting and formatting. Defaults to False.tracebacks_width
+(Optional, optional)
+— Number of characters used to render tracebacks, or None for full width. Defaults to None.tracebacks_code_width
+(int, optional)
+— Number of code characters used to render tracebacks, or None for full width. Defaults to 88.tracebacks_extra_lines
+(int, optional)
+— Additional lines of code to render tracebacks, or None for full width. Defaults to None.tracebacks_theme
+(Optional, optional)
+— Override pygments theme used in traceback.tracebacks_word_wrap
+(bool, optional)
+— Enable word wrapping of long tracebacks lines. Defaults to True.tracebacks_show_locals
+(bool, optional)
+— Enable display of locals in tracebacks. Defaults to False.tracebacks_suppress
+(Iterable, optional)
+— Optional sequence of modules or paths to exclude from traceback.tracebacks_max_frames
+(int, optional)
+— Optional maximum number of frames returned by traceback.locals_max_length
+(int, optional)
+— Maximum length of containers before abbreviating, or None for no abbreviation.Defaults to 10.
+locals_max_string
+(int, optional)
+— Maximum length of string before truncating, or None to disable. Defaults to 80.log_time_format
+(Union, optional)
+— If log_time
is enabled, either string for strftime or callable that formats the time. Defaults to "[%x %X] ".keywords
+(Optional, optional)
+— List of words to highlight instead of RichHandler.KEYWORDS
.acquire
(
)
+
+— Acquire the I/O thread lock.</>addFilter
(
filter
)
+
+— Add the specified filter to this handler.</>close
(
)
+
+— Tidy up any resources used by the handler.</>createLock
(
)
+
+— Acquire a thread lock for serializing access to the underlying I/O.</>emit
(
record
)
+
+— Invoked by logging.</>filter
(
record
)
+
+— Determine if a record is loggable by consulting all the filters.</>flush
(
)
+
+— Ensure all logging output has been flushed.</>format
(
record
)
+
+— Format the specified record.</>get_level_text
(
record
)
+(Text)
+— Get the level name from the record.</>handle
(
record
)
+
+— Conditionally emit the specified logging record.</>handleError
(
record
)
+
+— Handle errors which occur during an emit() call.</>release
(
)
+
+— Release the I/O thread lock.</>removeFilter
(
filter
)
+
+— Remove the specified filter from this handler.</>render
(
record
, traceback
, message_renderable
)
+(ConsoleRenderable)
+— Render log for display.</>render_message
(
record
, message
)
+(ConsoleRenderable)
+— Render message text in to Text.</>setFormatter
(
fmt
)
+
+— Set the formatter for this handler.</>setLevel
(
level
)
+
+— Set the logging level of this handler. level must be an int or a str.</>addFilter
(
filter
)
Add the specified filter to this handler.
removeFilter
(
filter
)
Remove the specified filter from this handler.
filter
(
record
)
Determine if a record is loggable by consulting all the filters.
The default is to allow the record to be logged; any filter can veto +this and the record is then dropped. Returns a zero value if a record +is to be dropped, else non-zero.
+.. versionchanged:: 3.2
+Allow filters to be just callables.
+createLock
(
)
Acquire a thread lock for serializing access to the underlying I/O.
acquire
(
)
Acquire the I/O thread lock.
release
(
)
Release the I/O thread lock.
setLevel
(
level
)
Set the logging level of this handler. level must be an int or a str.
format
(
record
)
Format the specified record.
If a formatter is set, use it. Otherwise, use the default formatter +for the module.
+handle
(
record
)
Conditionally emit the specified logging record.
Emission depends on filters which may have been added to the handler. +Wrap the actual emission of the record with acquisition/release of +the I/O thread lock. Returns whether the filter passed the record for +emission.
+setFormatter
(
fmt
)
Set the formatter for this handler.
flush
(
)
Ensure all logging output has been flushed.
This version does nothing and is intended to be implemented by +subclasses.
+close
(
)
Tidy up any resources used by the handler.
This version removes the handler from an internal map of handlers, +_handlers, which is used for handler lookup by name. Subclasses +should ensure that this gets called from overridden close() +methods.
+handleError
(
record
)
Handle errors which occur during an emit() call.
This method should be called from handlers when an exception is +encountered during an emit() call. If raiseExceptions is false, +exceptions get silently ignored. This is what is mostly wanted +for a logging system - most users will not care about errors in +the logging system, they are more interested in application errors. +You could, however, replace this with a custom handler if you wish. +The record which was being processed is passed in to this method.
+emit
(
record
)
Invoked by logging.
render_message
(
record
, message
)
Render message text in to Text.
record
+(LogRecord)
+— logging Record.message
+(str)
+— String containing log message.Renderable to display log message.
render
(
record
, traceback
, message_renderable
)
Render log for display.
record
+(LogRecord)
+— logging Record.traceback
+(Optional[Traceback])
+— Traceback instance or None for no Traceback.message_renderable
+(ConsoleRenderable)
+— Renderable (typically Text) containing log message contents.Renderable to display log.
get_level_text
(
record
)
Get the level name from the record.
record
+(LogRecord)
+— LogRecord instance.A tuple of the style and level name.
pipen.utils.
RichConsole
(
*args
, **kwargs
)
A high level console interface.
color_system
+
+— Get color system string.</>encoding
+
+— Get the encoding of the console file, e.g. "utf-8"
.</>file
+(IO)
+— Get the file object to write to.</>height
+
+— Get the height of the console.</>is_alt_screen
+
+— Check if the alt screen was enabled.</>is_dumb_terminal
+
+— Detect dumb terminal.</>is_terminal
+
+— Check if the console is writing to a terminal.</>options
+(ConsoleOptions)
+— Get default console options.</>size
+
+— Get the size of the console.</>width
+
+— Get the width of the console.</>__enter__
(
)
+(Console)
+— Own context manager to enter buffer context.</>__exit__
(
exc_type
, exc_value
, traceback
)
+
+— Exit buffer context.</>begin_capture
(
)
+
+— Begin capturing console output. Call :meth:end_capture
to exit capture mode and return output.</>bell
(
)
+
+— Play a 'bell' sound (if supported by the terminal).</>capture
(
)
+(Capture)
+— A context manager to capture the result of print() or log() in a string,rather than writing it to the console.
+</>clear
(
home
)
+
+— Clear the screen.</>clear_live
(
)
+
+— Clear the Live instance.</>control
(
*control
)
+
+— Insert non-printing control codes.</>end_capture
(
)
+(str)
+— End capture mode and return captured string.</>export_html
(
theme
, clear
, code_format
, inline_styles
)
+(str)
+— Generate HTML from console contents (requires record=True argument in constructor).</>export_svg
(
title
, theme
, clear
, code_format
, font_aspect_ratio
, unique_id
)
+(str)
+— Generate an SVG from the console contents (requires record=True in Console constructor).</>export_text
(
clear
, styles
)
+(str)
+— Generate text from console contents (requires record=True argument in constructor).</>get_style
(
name
, default
)
+(Style)
+— Get a Style instance by its theme name or parse a definition.</>input
(
prompt
, markup
, emoji
, password
, stream
)
+(str)
+— Displays a prompt and waits for input from the user. The prompt may contain color / style.</>line
(
count
)
+
+— Write new line(s).</>log
(
*objects
, sep
, end
, style
, justify
, emoji
, markup
, highlight
, log_locals
, _stack_offset
)
+
+— Log rich content to the terminal.</>measure
(
renderable
, options
)
+(Measurement)
+— Measure a renderable. Returns a :class:~rich.measure.Measurement
object which contains information regarding the number of characters required to print the renderable.
+</>on_broken_pipe
(
)
+
+— This function is called when a BrokenPipeError
is raised.</>out
(
*objects
, sep
, end
, style
, highlight
)
+
+— Output to the terminal. This is a low-level way of writing to the terminal which unlike:meth:~rich.console.Console.print
won't pretty print, wrap text, or apply markup, but will
+optionally apply highlighting and a basic style.
+</>pager
(
pager
, styles
, links
)
+(PagerContext)
+— A context manager to display anything printed within a "pager". The pager application is defined by the system and will typically support at least pressing a key to scroll.
+</>pop_render_hook
(
)
+
+— Pop the last renderhook from the stack.</>pop_theme
(
)
+
+— Remove theme from top of stack, restoring previous theme.</>print
(
*objects
, sep
, end
, style
, justify
, overflow
, no_wrap
, emoji
, markup
, highlight
, width
, height
, crop
, soft_wrap
, new_line_start
)
+
+— Print to the console.</>print_exception
(
width
, extra_lines
, theme
, word_wrap
, show_locals
, suppress
, max_frames
)
+
+— Prints a rich render of the last exception and traceback.</>print_json
(
json
, data
, indent
, highlight
, skip_keys
, ensure_ascii
, check_circular
, allow_nan
, default
, sort_keys
)
+
+— Pretty prints JSON. Output will be valid JSON.</>push_render_hook
(
hook
)
+
+— Add a new render hook to the stack.</>push_theme
(
theme
, inherit
)
+
+— Push a new theme on to the top of the stack, replacing the styles from the previous theme.Generally speaking, you should call :meth:~rich.console.Console.use_theme
to get a context manager, rather
+than calling this method directly.
+</>render
(
renderable
, options
)
+(Iterable[Segment])
+— Render an object in to an iterable of Segment
instances.</>render_lines
(
renderable
, options
, style
, pad
, new_lines
)
+(List)
+— Render objects in to a list of lines.</>render_str
(
text
, style
, justify
, overflow
, emoji
, markup
, highlight
, highlighter
)
+(ConsoleRenderable)
+— Convert a string to a Text instance. This is called automatically if you print or log a string.
+</>rule
(
title
, characters
, style
, align
)
+
+— Draw a line with optional centered title.</>save_html
(
path
, theme
, clear
, code_format
, inline_styles
)
+
+— Generate HTML from console contents and write to a file (requires record=True argument in constructor).</>save_svg
(
path
, title
, theme
, clear
, code_format
, font_aspect_ratio
, unique_id
)
+
+— Generate an SVG file from the console contents (requires record=True in Console constructor).</>save_text
(
path
, clear
, styles
)
+
+— Generate text from console and save to a given location (requires record=True argument in constructor).</>screen
(
hide_cursor
, style
)
+(~ScreenContext)
+— Context manager to enable and disable 'alternative screen' mode.</>set_alt_screen
(
enable
)
+(bool)
+— Enables alternative screen mode.</>set_live
(
live
)
+
+— Set Live instance. Used by Live context manager.</>set_window_title
(
title
)
+(bool)
+— Set the title of the console terminal window.</>show_cursor
(
show
)
+(bool)
+— Show or hide the cursor.</>status
(
status
, spinner
, spinner_style
, speed
, refresh_per_second
)
+(Status)
+— Display a status and spinner.</>update_screen
(
renderable
, region
, options
)
+
+— Update the screen at a given offset.</>update_screen_lines
(
lines
, x
, y
)
+
+— Update lines of the screen at a given offset.</>use_theme
(
theme
, inherit
)
+(ThemeContext)
+— Use a different theme for the duration of the context manager.</>set_live
(
live
)
Set Live instance. Used by Live context manager.
live
+(Live)
+— Live instance using this Console.errors.LiveError
+
+— If this Console has a Live context currently active.clear_live
(
)
Clear the Live instance.
push_render_hook
(
hook
)
Add a new render hook to the stack.
hook
+(RenderHook)
+— Render hook instance.pop_render_hook
(
)
Pop the last renderhook from the stack.
__enter__
(
)
→ ConsoleOwn context manager to enter buffer context.
__exit__
(
exc_type
, exc_value
, traceback
)
Exit buffer context.
begin_capture
(
)
Begin capturing console output. Call :meth:end_capture
to exit capture mode and return output.
end_capture
(
)
End capture mode and return captured string.
Console output.
push_theme
(
theme
, inherit=True
)
Push a new theme on to the top of the stack, replacing the styles from the previous theme.Generally speaking, you should call :meth:~rich.console.Console.use_theme
to get a context manager, rather
+than calling this method directly.
theme
+(Theme)
+— A theme instance.inherit
+(bool, optional)
+— Inherit existing styles. Defaults to True.pop_theme
(
)
Remove theme from top of stack, restoring previous theme.
use_theme
(
theme
, inherit=True
)
Use a different theme for the duration of the context manager.
theme
+(Theme)
+— Theme instance to use.inherit
+(bool, optional)
+— Inherit existing console styles. Defaults to True.
bell
(
)
Play a 'bell' sound (if supported by the terminal).
capture
(
)
A context manager to capture the result of print() or log() in a string, rather than writing it to the console.
+>>> from rich.console import Console
+>>> console = Console()
+>>> with console.capture() as capture:
+... console.print("[bold magenta]Hello World[/]")
+>>> print(capture.get())
+
Context manager that disables writing to the terminal.
pager
(
pager=None
, styles=False
, links=False
)
A context manager to display anything printed within a "pager". The pager application is defined by the system and will typically support at least pressing a key to scroll.
+pager
+(Pager, optional)
+— A pager object, or None to use :class:~rich.pager.SystemPager
. Defaults to None.styles
+(bool, optional)
+— Show styles in pager. Defaults to False.links
+(bool, optional)
+— Show links in pager. Defaults to False.
+>>> from rich.console import Console
+>>> from rich.__main__ import make_test_card
+>>> console = Console()
+>>> with console.pager():
+ console.print(make_test_card())
+
A context manager.
line
(
count=1
)
Write new line(s).
count
+(int, optional)
+— Number of new lines. Defaults to 1.clear
(
home=True
)
Clear the screen.
home
+(bool, optional)
+— Also move the cursor to 'home' position. Defaults to True.status
(
status
, spinner='dots'
, spinner_style='status.spinner'
, speed=1.0
, refresh_per_second=12.5
)
Display a status and spinner.
status
+(RenderableType)
+— A status renderable (str or Text typically).spinner
+(str, optional)
+— Name of spinner animation (see python -m rich.spinner). Defaults to "dots".spinner_style
+(StyleType, optional)
+— Style of spinner. Defaults to "status.spinner".speed
+(float, optional)
+— Speed factor for spinner animation. Defaults to 1.0.refresh_per_second
+(float, optional)
+— Number of refreshes per second. Defaults to 12.5.A Status object that may be used as a context manager.
show_cursor
(
show=True
)
→ boolShow or hide the cursor.
show
+(bool, optional)
+— Set visibility of the cursor.set_alt_screen
(
enable=True
)
Enables alternative screen mode.
Note, if you enable this mode, you should ensure that it is disabled before
+the application exits. See :meth:~rich.Console.screen
for a context manager
+that handles this for you.
enable
+(bool, optional)
+— Enable (True) or disable (False) alternate screen. Defaults to True.True if the control codes were written.
set_window_title
(
title
)
Set the title of the console terminal window.
Warning: There is no means within Rich of "resetting" the window title to its +previous value, meaning the title you set will persist even after your application +exits.
+fish
shell resets the window title before and after each command by default,
+negating this issue. Windows Terminal and command prompt will also reset the title for you.
+Most other shells and terminals, however, do not do this.
Some terminals may require configuration changes before you can set the title. +Some terminals may not support setting the title at all.
+Other software (including the terminal itself, the shell, custom prompts, plugins, etc.) +may also set the terminal window title. This could result in whatever value you write +using this method being overwritten.
+title
+(str)
+— The new title of the terminal window.True if the control code to change the terminal title was written, otherwise False. Note that a return value of True + does not guarantee that the window title has actually changed, + since the feature may be unsupported/disabled in some terminals.
+screen
(
hide_cursor=True
, style=None
)
Context manager to enable and disable 'alternative screen' mode.
hide_cursor
+(bool, optional)
+— Also hide the cursor. Defaults to False.style
+(Style, optional)
+— Optional style for screen. Defaults to None.Context which enables alternate screen on enter, and disables it on exit.
measure
(
renderable
, options=None
)
Measure a renderable. Returns a :class:~rich.measure.Measurement
object which contains information regarding the number of characters required to print the renderable.
renderable
+(RenderableType)
+— Any renderable or string.options
+(Optional[ConsoleOptions], optional)
+— Options to use when measuring, or None to use default options. Defaults to None.
+A measurement of the renderable.
render
(
renderable
, options=None
)
Render an object in to an iterable of Segment
instances.
This method contains the logic for rendering objects with the console protocol. +You are unlikely to need to use it directly, unless you are extending the library.
+renderable
+(RenderableType)
+— An object supporting the console protocol, or an object that may be converted to a string.
+options
+(ConsoleOptions, optional)
+— An options object, or None to use self.options. Defaults to None.An iterable of segments that may be rendered.
render_lines
(
renderable
, options=None
, style=None
, pad=True
, new_lines=False
)
→ ListRender objects in to a list of lines.
The output of render_lines is useful when further formatting of rendered console text
+ is required, such as the Panel class which draws a border around any renderable object.
+
+ Args:
+ renderable (RenderableType): Any object renderable in the console.
+ options (Optional[ConsoleOptions], optional): Console options, or None to use self.options. Default to ``None``.
+ style (Style, optional): Optional style to apply to renderables. Defaults to ``None``.
+ pad (bool, optional): Pad lines shorter than render width. Defaults to ``True``.
+        new_lines (bool, optional): Include "\n" characters at end of lines.
+ Returns:
+ List[List[Segment]]: A list of lines, where a line is a list of Segment objects.
+
+render_str
(
text
, style=''
, justify=None
, overflow=None
, emoji=None
, markup=None
, highlight=None
, highlighter=None
)
Convert a string to a Text instance. This is called automatically if you print or log a string.
+text
+(str)
+— Text to render.style
+(Union[str, Style], optional)
+— Style to apply to rendered text.justify
+(str, optional)
+— Justify method: "default", "left", "center", "full", or "right". Defaults to None
.overflow
+(str, optional)
+— Overflow method: "crop", "fold", or "ellipsis". Defaults to None
.emoji
+(Optional[bool], optional)
+— Enable emoji, or None
to use Console default.markup
+(Optional[bool], optional)
+— Enable markup, or None
to use Console default.highlight
+(Optional[bool], optional)
+— Enable highlighting, or None
to use Console default.highlighter
+(HighlighterType, optional)
+— Optional highlighter to apply.Renderable object.
get_style
(
name
, default=None
)
Get a Style instance by its theme name or parse a definition.
name
+(str)
+— The name of a style or a style definition.A Style object.
MissingStyle
+
+— If no style could be parsed from name.rule
(
title=''
, characters='─'
, style='rule.line'
, align='center'
)
Draw a line with optional centered title.
title
+(str, optional)
+— Text to render over the rule. Defaults to "".characters
+(str, optional)
+— Character(s) to form the line. Defaults to "─".style
+(str, optional)
+— Style of line. Defaults to "rule.line".align
+(str, optional)
+— How to align the title, one of "left", "center", or "right". Defaults to "center".control
(
*control
)
Insert non-printing control codes.
out
(
*objects
, sep=' '
, end='\n'
, style=None
, highlight=None
)
Output to the terminal. This is a low-level way of writing to the terminal which unlike:meth:~rich.console.Console.print
won't pretty print, wrap text, or apply markup, but will
+optionally apply highlighting and a basic style.
sep
+(str, optional)
+— String to write between print data. Defaults to " ".end
+(str, optional)
+— String to write at end of print data. Defaults to "\n".style
+(Union[str, Style], optional)
+— A style to apply to output. Defaults to None.highlight
+(Optional[bool], optional)
+— Enable automatic highlighting, or None
to use console default. Defaults to None
.
+print
(
*objects
, sep=' '
, end='\n'
, style=None
, justify=None
, overflow=None
, no_wrap=None
, emoji=None
, markup=None
, highlight=None
, width=None
, height=None
, crop=True
, soft_wrap=None
, new_line_start=False
)
Print to the console.
sep
+(str, optional)
+— String to write between print data. Defaults to " ".end
+(str, optional)
+— String to write at end of print data. Defaults to "\n".style
+(Union[str, Style], optional)
+— A style to apply to output. Defaults to None.justify
+(str, optional)
+— Justify method: "default", "left", "right", "center", or "full". Defaults to None
.overflow
+(str, optional)
+— Overflow method: "ignore", "crop", "fold", or "ellipsis". Defaults to None.no_wrap
+(Optional[bool], optional)
+— Disable word wrapping. Defaults to None.emoji
+(Optional[bool], optional)
+— Enable emoji code, or None
to use console default. Defaults to None
.markup
+(Optional[bool], optional)
+— Enable markup, or None
to use console default. Defaults to None
.highlight
+(Optional[bool], optional)
+— Enable automatic highlighting, or None
to use console default. Defaults to None
.width
+(Optional[int], optional)
+— Width of output, or None
to auto-detect. Defaults to None
.crop
+(Optional[bool], optional)
+— Crop output to width of terminal. Defaults to True.soft_wrap
+(bool, optional)
+— Enable soft wrap mode which disables word wrapping and cropping of text or None
for Console default. Defaults to None
.
+new_line_start
+(bool, False)
+— Insert a new line at the start if the output contains more than one line. Defaults to False
.print_json
(
json=None
, data=None
, indent=2
, highlight=True
, skip_keys=False
, ensure_ascii=False
, check_circular=True
, allow_nan=True
, default=None
, sort_keys=False
)
Pretty prints JSON. Output will be valid JSON.
json
+(Optional[str])
+— A string containing JSON.data
+(Any)
+— If json is not supplied, then encode this data.indent
+(Union[None, int, str], optional)
+— Number of spaces to indent. Defaults to 2.highlight
+(bool, optional)
+— Enable highlighting of output: Defaults to True.skip_keys
+(bool, optional)
+— Skip keys not of a basic type. Defaults to False.ensure_ascii
+(bool, optional)
+— Escape all non-ascii characters. Defaults to False.check_circular
+(bool, optional)
+— Check for circular references. Defaults to True.allow_nan
+(bool, optional)
+— Allow NaN and Infinity values. Defaults to True.default
+(Callable, optional)
+— A callable that converts values that can not be encoded in to something that can be JSON encoded. Defaults to None.
+sort_keys
+(bool, optional)
+— Sort dictionary keys. Defaults to False.update_screen
(
renderable
, region=None
, options=None
)
Update the screen at a given offset.
renderable
+(RenderableType)
+— A Rich renderable.region
+(Region, optional)
+— Region of screen to update, or None for entire screen. Defaults to None.errors.NoAltScreen
+
+— If the Console isn't in alt screen mode.update_screen_lines
(
lines
, x=0
, y=0
)
Update lines of the screen at a given offset.
lines
+(List[List[Segment]])
+— Rendered lines (as produced by :meth:~rich.Console.render_lines
).x
+(int, optional)
+— x offset (column no). Defaults to 0.y
+(int, optional)
+— y offset (column no). Defaults to 0.errors.NoAltScreen
+
+— If the Console isn't in alt screen mode.print_exception
(
width=100
, extra_lines=3
, theme=None
, word_wrap=False
, show_locals=False
, suppress=()
, max_frames=100
)
Prints a rich render of the last exception and traceback.
width
+(Optional[int], optional)
+— Number of characters used to render code. Defaults to 100.extra_lines
+(int, optional)
+— Additional lines of code to render. Defaults to 3.theme
+(str, optional)
+— Override pygments theme used in tracebackword_wrap
+(bool, optional)
+— Enable word wrapping of long lines. Defaults to False.show_locals
+(bool, optional)
+— Enable display of local variables. Defaults to False.suppress
+(Iterable[Union[str, ModuleType]])
+— Optional sequence of modules or paths to exclude from traceback.max_frames
+(int)
+— Maximum number of frames to show in a traceback, 0 for no maximum. Defaults to 100.log
(
*objects
, sep=' '
, end='\n'
, style=None
, justify=None
, emoji=None
, markup=None
, highlight=None
, log_locals=False
, _stack_offset=1
)
Log rich content to the terminal.
sep
+(str, optional)
+— String to write between print data. Defaults to " ".end
+(str, optional)
+— String to write at end of print data. Defaults to "\n".style
+(Union[str, Style], optional)
+— A style to apply to output. Defaults to None.justify
+(str, optional)
+— One of "left", "right", "center", or "full". Defaults to None
.emoji
+(Optional[bool], optional)
+— Enable emoji code, or None
to use console default. Defaults to None.markup
+(Optional[bool], optional)
+— Enable markup, or None
to use console default. Defaults to None.highlight
+(Optional[bool], optional)
+— Enable automatic highlighting, or None
to use console default. Defaults to None.log_locals
+(bool, optional)
+— Boolean to enable logging of locals where log()
was called. Defaults to False.
+_stack_offset
+(int, optional)
+— Offset of caller from end of call stack. Defaults to 1.on_broken_pipe
(
)
This function is called when a BrokenPipeError
is raised.
This can occur when piping Textual output in Linux and macOS. +The default implementation is to exit the app, but you could implement +this method in a subclass to change the behavior.
+See https://docs.python.org/3/library/signal.html#note-on-sigpipe for details.
+input
(
prompt=''
, markup=True
, emoji=True
, password=False
, stream=None
)
Displays a prompt and waits for input from the user. The prompt may contain color / style.
It works in the same way as Python's builtin :func:input
function and provides elaborate line editing and history features if Python's builtin :mod:readline
module is previously loaded.
prompt
+(Union[str, Text])
+— Text to render in the prompt.markup
+(bool, optional)
+— Enable console markup (requires a str prompt). Defaults to True.emoji
+(bool, optional)
+— Enable emoji (requires a str prompt). Defaults to True.password
+(bool, optional)
+— Hide typed text. Defaults to False.stream
+(Optional, optional)
+— Optional file to read input from (rather than stdin). Defaults to None.Text read from stdin.
export_text
(
clear=True
, styles=False
)
Generate text from console contents (requires record=True argument in constructor).
clear
+(bool, optional)
+— Clear record buffer after exporting. Defaults to True
.styles
+(bool, optional)
+— If True
, ansi escape codes will be included. False
for plain text.Defaults to False
.
+String containing console contents.
save_text
(
path
, clear=True
, styles=False
)
Generate text from console and save to a given location (requires record=True argument in constructor).
path
+(str)
+— Path to write text files.clear
+(bool, optional)
+— Clear record buffer after exporting. Defaults to True
.styles
+(bool, optional)
+— If True
, ansi style codes will be included. False
for plain text.Defaults to False
.
+export_html
(
theme=None
, clear=True
, code_format=None
, inline_styles=False
)
Generate HTML from console contents (requires record=True argument in constructor).
theme
+(TerminalTheme, optional)
+— TerminalTheme object containing console colors.clear
+(bool, optional)
+— Clear record buffer after exporting. Defaults to True
.code_format
+(str, optional)
+— Format string to render HTML. In addition to '{foreground}','{background}', and '{code}', should contain '{stylesheet}' if inline_styles is False
.
+inline_styles
+(bool, optional)
+— If True
styles will be inlined in to spans, which makes files larger but easier to cut and paste markup. If False
, styles will be embedded in a style tag.
+Defaults to False.
+String containing console contents as HTML.
save_html
(
path
, theme=None
, clear=True
, code_format='<!DOCTYPE html>\n<html>\n<head>\n<meta charset="UTF-8">\n<style>\n{stylesheet}\nbody {{\n color: {foreground};\n background-color: {background};\n}}\n</style>\n</head>\n<body>\n <pre style="font-family:Menlo,\'DejaVu Sans Mono\',consolas,\'Courier New\',monospace"><code style="font-family:inherit">{code}</code></pre>\n</body>\n</html>\n'
, inline_styles=False
)
Generate HTML from console contents and write to a file (requires record=True argument in constructor).
path
+(str)
+— Path to write html file.theme
+(TerminalTheme, optional)
+— TerminalTheme object containing console colors.clear
+(bool, optional)
+— Clear record buffer after exporting. Defaults to True
.code_format
+(str, optional)
+— Format string to render HTML. In addition to '{foreground}','{background}', and '{code}', should contain '{stylesheet}' if inline_styles is False
.
+inline_styles
+(bool, optional)
+— If True
styles will be inlined in to spans, which makes files larger but easier to cut and paste markup. If False
, styles will be embedded in a style tag.
+Defaults to False.
+export_svg
(
title='Rich'
, theme=None
, clear=True
, code_format='<svg class="rich-terminal" viewBox="0 0 {width} {height}" xmlns="http://www.w3.org/2000/svg">\n <!-- Generated with Rich https://www.textualize.io -->\n <style>\n\n @font-face {{\n font-family: "Fira Code";\n src: local("FiraCode-Regular"),\n url("https://cdnjs.cloudflare.com/ajax/libs/firacode/6.2.0/woff2/FiraCode-Regular.woff2") format("woff2"),\n url("https://cdnjs.cloudflare.com/ajax/libs/firacode/6.2.0/woff/FiraCode-Regular.woff") format("woff");\n font-style: normal;\n font-weight: 400;\n }}\n @font-face {{\n font-family: "Fira Code";\n src: local("FiraCode-Bold"),\n url("https://cdnjs.cloudflare.com/ajax/libs/firacode/6.2.0/woff2/FiraCode-Bold.woff2") format("woff2"),\n url("https://cdnjs.cloudflare.com/ajax/libs/firacode/6.2.0/woff/FiraCode-Bold.woff") format("woff");\n font-style: bold;\n font-weight: 700;\n }}\n\n .{unique_id}-matrix {{\n font-family: Fira Code, monospace;\n font-size: {char_height}px;\n line-height: {line_height}px;\n font-variant-east-asian: full-width;\n }}\n\n .{unique_id}-title {{\n font-size: 18px;\n font-weight: bold;\n font-family: arial;\n }}\n\n {styles}\n </style>\n\n <defs>\n <clipPath id="{unique_id}-clip-terminal">\n <rect x="0" y="0" width="{terminal_width}" height="{terminal_height}" />\n </clipPath>\n {lines}\n </defs>\n\n {chrome}\n <g transform="translate({terminal_x}, {terminal_y})" clip-path="url(#{unique_id}-clip-terminal)">\n {backgrounds}\n <g class="{unique_id}-matrix">\n {matrix}\n </g>\n </g>\n</svg>\n'
, font_aspect_ratio=0.61
, unique_id=None
)
→ strGenerate an SVG from the console contents (requires record=True in Console constructor).
title
+(str, optional)
+— The title of the tab in the output imagetheme
+(TerminalTheme, optional)
+— The TerminalTheme
object to use to style the terminalclear
+(bool, optional)
+— Clear record buffer after exporting. Defaults to True
code_format
+(str, optional)
+— Format string used to generate the SVG. Rich will inject a number of variables into the string in order to form the final SVG output. The default template used and the variables
+injected by Rich can be found by inspecting the console.CONSOLE_SVG_FORMAT
variable.
+font_aspect_ratio
+(float, optional)
+— The width to height ratio of the font used in the code_format
string. Defaults to 0.61, which is the width to height ratio of Fira Code (the default font).
+If you aren't specifying a different font inside code_format
, you probably don't need this.
+unique_id
+(str, optional)
+— unique id that is used as the prefix for various elements (CSS styles, node ids). If not set, this defaults to a computed value based on the recorded content.
+save_svg
(
path
, title='Rich'
, theme=None
, clear=True
, code_format='<svg class="rich-terminal" viewBox="0 0 {width} {height}" xmlns="http://www.w3.org/2000/svg">\n <!-- Generated with Rich https://www.textualize.io -->\n <style>\n\n @font-face {{\n font-family: "Fira Code";\n src: local("FiraCode-Regular"),\n url("https://cdnjs.cloudflare.com/ajax/libs/firacode/6.2.0/woff2/FiraCode-Regular.woff2") format("woff2"),\n url("https://cdnjs.cloudflare.com/ajax/libs/firacode/6.2.0/woff/FiraCode-Regular.woff") format("woff");\n font-style: normal;\n font-weight: 400;\n }}\n @font-face {{\n font-family: "Fira Code";\n src: local("FiraCode-Bold"),\n url("https://cdnjs.cloudflare.com/ajax/libs/firacode/6.2.0/woff2/FiraCode-Bold.woff2") format("woff2"),\n url("https://cdnjs.cloudflare.com/ajax/libs/firacode/6.2.0/woff/FiraCode-Bold.woff") format("woff");\n font-style: bold;\n font-weight: 700;\n }}\n\n .{unique_id}-matrix {{\n font-family: Fira Code, monospace;\n font-size: {char_height}px;\n line-height: {line_height}px;\n font-variant-east-asian: full-width;\n }}\n\n .{unique_id}-title {{\n font-size: 18px;\n font-weight: bold;\n font-family: arial;\n }}\n\n {styles}\n </style>\n\n <defs>\n <clipPath id="{unique_id}-clip-terminal">\n <rect x="0" y="0" width="{terminal_width}" height="{terminal_height}" />\n </clipPath>\n {lines}\n </defs>\n\n {chrome}\n <g transform="translate({terminal_x}, {terminal_y})" clip-path="url(#{unique_id}-clip-terminal)">\n {backgrounds}\n <g class="{unique_id}-matrix">\n {matrix}\n </g>\n </g>\n</svg>\n'
, font_aspect_ratio=0.61
, unique_id=None
)
Generate an SVG file from the console contents (requires record=True in Console constructor).
path
+(str)
+— The path to write the SVG to.title
+(str, optional)
+— The title of the tab in the output imagetheme
+(TerminalTheme, optional)
+— The TerminalTheme
object to use to style the terminalclear
+(bool, optional)
+— Clear record buffer after exporting. Defaults to True
code_format
+(str, optional)
+— Format string used to generate the SVG. Rich will inject a number of variables into the string in order to form the final SVG output. The default template used and the variables
+injected by Rich can be found by inspecting the console.CONSOLE_SVG_FORMAT
variable.
+font_aspect_ratio
+(float, optional)
+— The width to height ratio of the font used in the code_format
string. Defaults to 0.61, which is the width to height ratio of Fira Code (the default font).
+If you aren't specifying a different font inside code_format
, you probably don't need this.
+unique_id
+(str, optional)
+— unique id that is used as the prefix for various elements (CSS styles, node ids). If not set, this defaults to a computed value based on the recorded content.
+pipen.utils.
get_logger
(
name='core'
, level=None
)
Get the logger by given plugin name
level
+(str | int, optional)
+— The initial level of the loggerThe logger
pipen.utils.
desc_from_docstring
(
obj
, base
)
Get the description from docstring
Only extract the summary.
+obj
+(Type[Pipen | Proc])
+— The object with docstringThe summary as desc
pipen.utils.
update_dict
(
parent
, new
, depth=0
)
Update the new dict to the parent, but make sure parent does not change
parent
+(Mapping)
+— The parent dictionarynew
+(Mapping)
+— The new dictionarydepth
+(int, optional)
+— The depth to be copied. 0 for updating to the deepest level.
+>>> parent = {"a": {"b": 1}}
+>>> new = {"a": {"c": 2}}
+>>> update_dict(parent, new)
+>>> # {"a": {"b": 1, "c": 2}}
+
The updated dictionary or None if both parent and new are None.
pipen.utils.
strsplit
(
string
, sep
, maxsplit=-1
, trim='both'
)
→ ListSplit the string, with the ability to trim each part.
pipen.utils.
get_shebang
(
script
)
Get the shebang of the script
script
+(str)
+— The script stringNone if the script does not contain a shebang, otherwise the shebangwithout #!
prefix
pipen.utils.
ignore_firstline_dedent
(
text
)
Like textwrap.dedent(), but ignore first empty lines
text
+(str)
+— The text the be dedentedThe dedented text
pipen.utils.
copy_dict
(
dic
, depth=1
)
Deep copy a dict
dic
+(Mapping)
+— The dict to be copieddepth
+(int, optional)
+— The depth to be deep copiedThe deep-copied dict
pipen.utils.
get_logpanel_width
(
)
Get the width of the log content
max_width
+
+— The maximum width to returnNote that it's not the console width. With console width, you
+have to subtract the width of the log meta info
+(CONSOLE_WIDTH_SHIFT).
+The width of the log content
pipen.utils.
log_rich_renderable
(
renderable
, color
, logfunc
, *args
, **kwargs
)
Log a rich renderable to logger
renderable
+(RenderableType)
+— The rich renderablelogfunc
+(Callable)
+— The log function, if message is not the first argument, use functools.partial to wrap it
+*args
+(Any)
+— The arguments to the log function**kwargs
+(Any)
+— The keyword arguments to the log functionsplitline
+
+— Whether split the lines or log the entire messagepipen.utils.
brief_list
(
blist
)
Briefly show an integer list, combining the continuous numbers.
blist
+(List)
+— The listThe string to show for the briefed list.
pipen.utils.
pipen_banner
(
)
The banner for pipen
The banner renderable
pipen.utils.
get_mtime
(
path
, dir_depth=1
)
Get the modification time of a path. If path is a directory, try to get the last modification time of the +contents in the directory at given dir_depth
+dir_depth
+(int, optional)
+— The depth of the directory to check thelast modification time
+The last modification time of path
pipen.utils.
is_subclass
(
obj
, cls
)
Tell if obj is a subclass of cls. The difference from issubclass is that we don't raise a TypeError if obj is not a class
+obj
+(Any)
+— The object to checkcls
+(type)
+— The class to checkTrue if obj is a subclass of cls otherwise False
pipen.utils.
load_entrypoints
(
group
)
Load objects from setuptools entrypoints by given group name
group
+(str)
+— The group name of the entrypointsAn iterable of tuples with name and the loaded object
pipen.utils.
truncate_text
(
text
, width
, end='…'
)
Truncate a text not based on words/whitespaces. Otherwise, we could use textwrap.shorten.
+text
+(str)
+— The text to be truncatedwidth
+(int)
+— The max width of the truncated textend
+(str, optional)
+— The end string of the truncated textThe truncated text with end appended.
pipen.utils.
make_df_colnames_unique_inplace
(
thedf
)
Make the columns of a data frame unique
thedf
+(pandas.DataFrame)
+— The data framepipen.utils.
get_base
(
klass
, abc_base
, value
, value_getter
)
Get the base class where the value was first defined
klass
+(Type)
+— The classabc_base
+(Type)
+— The very base class to check in basesvalue
+(Any)
+— The value to checkvalue_getter
+(Callable)
+— How to get the value from the classThe base class
pipen.utils.
mark
(
**kwargs
)
Mark a class (e.g. Proc) with given kwargs as metadata
These marks will not be inherited by the subclasses if the class is
+a subclass of Proc
or ProcGroup
.
**kwargs
+
+— The kwargs to mark the procThe decorator
pipen.utils.
get_marked
(
cls
, mark_name
, default=None
)
Get the marked value from a proc
cls
+(type)
+— The procmark_name
+(str)
+— The mark namedefault
+(Any, optional)
+— The default value if the mark is not foundThe marked value
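A short sketch of how mark() and get_marked() pair up; the mark name "deprecated" and the fallback value are made up for illustration:
>>> from pipen import Proc
>>> from pipen.utils import mark, get_marked
>>>
>>> @mark(deprecated=True)
>>> class MyProc(Proc):
>>>     input = "infile:file"
>>>
>>> get_marked(MyProc, "deprecated")            # True
>>> get_marked(MyProc, "missing", "fallback")   # "fallback"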
pipen.utils.
is_valid_name
(
name
)
Check if a name is valid for a proc or pipen
name
+(str)
+— The name to checkTrue if valid, otherwise False
pipen.utils.
load_pipeline
(
obj
, argv0=None
, argv1p=None
, **kwargs
)
Load a pipeline from a Pipen, Proc or ProcGroup object
It not only loads the Pipen object or converts the Proc/ProcGroup
+object to a Pipen, but also builds the process relationships, so that we
+can access pipeline.procs
and requires/nexts
of each proc.
To avoid running the pipeline, and to notify the plugins that this is just
+for loading the pipeline, sys.argv[0]
is set to @pipen
.
obj
+(str | Type[Proc] | Type[ProcGroup] | Type[Pipen])
+— The Pipen, Proc or ProcGroup object. It can also be a string inthe format of part1:part2
to load the pipeline, where part1 is
+a path to a python file or package directory, and part2 is the name
+of the proc, procgroup or pipeline to load.
+It should be able to be loaded by getattr(module, part2)
, where
+module is loaded from part1
.
+argv0
+(str | none, optional)
+— The value to replace sys.argv[0]. "@pipen" will be used by default.
+argv1p
+(Optional, optional)
+— The values to replace sys.argv[1:]. Do not replace by default.kwargs
+
+— The kwargs to pass to the Pipen constructorThe loaded Pipen object
TypeError
+
+— If obj or loaded obj is not a Pipen, Proc or ProcGrouppipen.utils.
is_loading_pipeline
(
*flags
, argv=None
)
Check if we are loading the pipeline. Works only when argv0
is "@pipen" while loading the pipeline.
Note if you are using this function at compile time, make
+sure you load your pipeline using the string form (part1:part2
)
+See more with load_pipeline()
.
*flags
+(str)
+— Additional flags to check in sys.argv (e.g. "-h", "--help")to determine if we are loading the pipeline
+argv
+(Optional, optional)
+— The arguments to check. sys.argv is used by default.Note that the first argument should be included in the check.
+You could typically pass [sys.argv[0], *your_args]
to this if you want
+to check if sys.argv[0]
is "@pipen" or your_args
contains some flags.
True if we are loading the pipeline (argv[0] == "@pipen"), otherwise False
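A sketch combining the two helpers above. The file pipeline.py, the pipeline name MyPipeline and the helper load_reference_data() are hypothetical, and load_pipeline is driven with asyncio here on the assumption that it is exposed as a coroutine in your pipen version:
>>> # inside pipeline.py: skip expensive work while the pipeline is only being loaded
>>> from pipen.utils import is_loading_pipeline
>>> if not is_loading_pipeline("-h", "--help"):
>>>     reference = load_reference_data()  # hypothetical helper
>>>
>>> # elsewhere: load the pipeline without running it, then inspect it
>>> import asyncio
>>> from pipen.utils import load_pipeline
>>> pipeline = asyncio.run(load_pipeline("pipeline.py:MyPipeline"))
>>> print([proc.name for proc in pipeline.procs])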
+A pipeline framework for python
Provide some function for creating and modifying channels (dataframes)
collapse_files
(
data
, col
)
+(DataFrame)
+— Collapse a Channel according to the files in expand_dir
(
data
, col
, pattern
, ftype
, sortby
, reverse
)
+(DataFrame)
+— Expand a Channel according to the files in Provide exception classes
PipenException
+
+— Base exception class for pipen</>PipenSetDataError
+
+— When trying to set input data to processes with input_data already set using Pipen.set_data().
+</>ProcInputTypeError
+
+— When an unsupported input type is provided</>ProcInputKeyError
+
+— When an unsupported input key is provided</>ProcInputValueError
+
+— When an unsupported input value is provided</>ProcScriptFileNotFound
+
+— When script file specified as 'file://' cannot be found</>ProcOutputNameError
+
+— When no name or malformatted output is provided</>ProcOutputTypeError
+
+— When an unsupported output type is provided</>ProcOutputValueError
+
+— When a malformatted output value is provided</>ProcDependencyError
+
+— When there is something wrong the process dependencies</>NoSuchSchedulerError
+
+— When specified scheduler cannot be found</>WrongSchedulerTypeError
+
+— When specified scheduler is not a subclass of Scheduler</>NoSuchTemplateEngineError
+
+— When specified template engine cannot be found</>WrongTemplateEnginTypeError
+
+— When specified template engine is not a subclass of Template</>TemplateRenderingError
+
+— Failed to render a template</>ConfigurationError
+
+— When something wrong set as configuration</>PipenOrProcNameError
+
+— "When more than one processes are sharing the same workdir</>Provide some default values/objects
ProcInputType
+
+— Types for process inputs</>ProcOutputType
+
+— Types for process outputs</>Define hooks specifications and provide plugin manager
PipenMainPlugin
+
+— The builtin core plugin, used to update the progress bar and cache the job
+</>XqutePipenPlugin
+
+— The plugin for xqute working as proxy for pipen plugin hooks</>clear_path
(
job
, path
, is_dir
)
+(bool)
+— Clear the path, either a file or a directory</>get_mtime
(
job
, path
, dirsig
)
+(float)
+— Get the mtime of a path, either a file or a directory</>norm_inpath
(
job
, inpath
, is_dir
)
+(str)
+— Normalize the input path</>norm_outpath
(
job
, outpath
, is_dir
)
+(str)
+— Normalize the output path</>on_complete
(
pipen
, succeeded
)
+
+— When the pipeline is completed.</>on_init
(
pipen
)
+
+— When the pipeline is initialized, and default configs are loaded</>on_job_cached
(
job
)
+
+— When a job is cached.</>on_job_failed
(
job
)
+
+— When a job is done but failed.</>on_job_init
(
job
)
+
+— When a job is initialized</>on_job_killed
(
job
)
+
+— When a job is killed</>on_job_killing
(
job
)
+(bool)
+— When a job is being killed.</>on_job_polling
(
job
)
+
+— When status of a job is being polled.</>on_job_queued
(
job
)
+
+— When a job is queued in xqute. Note it might not be queued yet in the scheduler system.
+</>on_job_started
(
job
)
+
+— When a job starts to run in the scheduler system.</>on_job_submitted
(
job
)
+
+— When a job is submitted in the scheduler system.</>on_job_submitting
(
job
)
+(bool)
+— When a job is submitting.</>on_job_succeeded
(
job
)
+
+— When a job completes successfully.</>on_jobcmd_end
(
job
)
+(str)
+— When the job command finishes and after the postscript is run</>on_jobcmd_init
(
job
)
+(str)
+— When the job command wrapper script is initialized before the prescript is run</>on_jobcmd_prep
(
job
)
+(str)
+— When the job command is right about to be run</>on_proc_create
(
proc
)
+
+— Called in the Proc constructor when a process is created.</>on_proc_done
(
proc
, succeeded
)
+
+— When a process is done</>on_proc_init
(
proc
)
+
+— Called when a process is initialized.</>on_proc_input_computed
(
proc
)
+
+— Called after process input data is computed.</>on_proc_script_computed
(
proc
)
+
+— Called after process script is computed.</>on_proc_shutdown
(
proc
, sig
)
+
+— When pipeline is shutting down, by Ctrl-c for example.</>on_proc_start
(
proc
)
+
+— When a process is starting</>on_setup
(
config
)
+
+— Setup for plugins, primarily used for the plugins to set up some default configurations.
+</>on_start
(
pipen
)
+
+— Right before the pipeline starts running.</>output_exists
(
job
, path
, is_dir
)
+(bool)
+— Check if the output exists</>Process group that contains a set of processes.
It can be easily used to create a pipeline that runs independently or +integrated into a larger pipeline.
+Runs directly: +
>>> proc_group = ProcGroup(<options>)
+>>> proc_group.as_pipen(<pipeline options>).set_data(<data>).run()
+
Integrated into a larger pipeline +
>>> proc_group = ProcGroup(<options>)
+>>> # proc could be a process within the larger pipeline
+>>> proc.requires = proc_group.<proc>
+
To add a process to the proc group, use the add_proc
method:
+
>>> class MyProcGroup(ProcGroup):
+>>> ...
+>>>
+>>> proc_group = MyProcGroup(...)
+>>> @proc_group.add_proc
+>>> class MyProc(Proc):
+>>> ...
+
Or add a process at runtime: +
>>> class MyProcGroup(ProcGroup):
+>>> ...
+>>>
+>>> @ProcGroup.add_proc
+>>> def my_proc(self):
+>>> class MyProc(Proc):
+>>> # You may use self.options here
+>>> ...
+>>> return MyProc
+>>> proc_group = MyProcGroup(...)
+
ProcGropuMeta
+
+— Meta class for ProcGroup</>ProcGroup
+
+— A group of processes that can be run independently or integrated into a larger pipeline.
+</>Provide some utilities
RichHandler
+
+— Subclass of rich.logging.RichHandler, showing log levels as a single character
+</>RichConsole
+
+— A high level console interface.</>brief_list
(
blist
)
+(str)
+— Briefly show an integer list, combine the continuous numbers.</>copy_dict
(
dic
, depth
)
+(Mapping)
+— Deep copy a dict</>desc_from_docstring
(
obj
, base
)
+(str)
+— Get the description from docstring</>get_base
(
klass
, abc_base
, value
, value_getter
)
+(Type)
+— Get the base class where the value was first defined</>get_logger
(
name
, level
)
+(LoggerAdapter)
+— Get the logger by given plugin name</>get_logpanel_width
(
)
+(int)
+— Get the width of the log content</>get_marked
(
cls
, mark_name
, default
)
+(Any)
+— Get the marked value from a proc</>get_mtime
(
path
, dir_depth
)
+(float)
+— Get the modification time of a path. If path is a directory, try to get the last modification time of the
+contents in the directory at given dir_depth
+</>get_shebang
(
script
)
+(str)
+— Get the shebang of the script</>ignore_firstline_dedent
(
text
)
+(str)
+— Like textwrap.dedent(), but ignore first empty lines</>is_loading_pipeline
(
*flags
, argv
)
+(bool)
+— Check if we are loading the pipeline. Works only when argv0
is "@pipen" while loading the pipeline.
+</>is_subclass
(
obj
, cls
)
+(bool)
+— Tell if obj is a subclass of cls. The difference from issubclass is that we don't raise a TypeError if obj
+is not a class
+</>is_valid_name
(
name
)
+(bool)
+— Check if a name is valid for a proc or pipen</>load_entrypoints
(
group
)
+(Iterable)
+— Load objects from setuptools entrypoints by given group name</>load_pipeline
(
obj
, argv0
, argv1p
, **kwargs
)
+(Pipen)
+— Load a pipeline from a Pipen, Proc or ProcGroup object</>log_rich_renderable
(
renderable
, color
, logfunc
, *args
, **kwargs
)
+
+— Log a rich renderable to logger</>make_df_colnames_unique_inplace
(
thedf
)
+
+— Make the columns of a data frame unique</>mark
(
**kwargs
)
+(Callable)
+— Mark a class (e.g. Proc) with given kwargs as metadata</>pipen_banner
(
)
+(RenderableType)
+— The banner for pipen</>strsplit
(
string
, sep
, maxsplit
, trim
)
+(List)
+— Split the string, with the ability to trim each part.</>truncate_text
(
text
, width
, end
)
+(str)
+— Truncate a text not based on words/whitespaces. Otherwise, we could use textwrap.shorten.
+</>update_dict
(
parent
, new
, depth
)
+(Mapping)
+— Update the new dict to the parent, but make sure parent does not change</>Template adaptor for pipen
Template
(
source
, **kwargs
)
+
+— Base class wrapper to wrap template for pipen</>TemplateLiquid
+
+— Liquidpy template wrapper.</>TemplateJinja2
+
+— Jinja2 template wrapper</>get_template_engine
(
template
)
+(Type)
+— Get the template engine by name or the template engine itself</>Provide builting schedulers
LocalJob
+
+— Job class for local scheduler</>LocalScheduler
+
+— Local scheduler</>SgeJob
+
+— Job class for SGE scheduler</>SgeScheduler
+
+— SGE scheduler</>SlurmJob
+
+— Job class for Slurm scheduler</>SlurmScheduler
+
+— Slurm scheduler</>SshJob
+
+— Job class for SSH scheduler</>SshScheduler
+
+— SSH scheduler</>get_scheduler
(
scheduler
)
+(Type)
+— Get the scheduler by name or the scheduler class itself</>Provide the PipelinePBar and ProcPBar classes
ProcPBar
+
+— The progress bar for processes</>PipelinePBar
+
+— Progress bar for the pipeline</>"""Provide some function for creating and modifying channels (dataframes)"""
+from __future__ import annotations
+
+from glob import glob
+from os import path
+from typing import Any, List
+
+import pandas
+from pandas import DataFrame
+from pipda import register_verb
+
+
+# ----------------------------------------------------------------
+# Creators
+class Channel(DataFrame):
+ """A DataFrame wrapper with creators"""
+
+    @classmethod
+ def create(cls, value: DataFrame | List[Any]) -> DataFrame:
+ """Create a channel from a list.
+
+ The second dimension is identified by tuple. if all elements are tuple,
+ then a channel is created directly. Otherwise, elements are converted
+ to tuples first and channels are created then.
+
+ Examples:
+ >>> Channel.create([1, 2, 3]) # 3 rows, 1 column
+ >>> Channel.create([(1,2,3)]) # 1 row, 3 columns
+
+ Args:
+ value: The value to create a channel
+
+ Returns:
+ A channel (dataframe)
+ """
+ if isinstance(value, DataFrame):
+ return value
+ if all(isinstance(elem, tuple) for elem in value):
+ return cls(value)
+ return cls((val,) for val in value)
+
+    @classmethod
+ def from_glob(
+ cls,
+ pattern: str,
+ ftype: str = "any",
+ sortby: str = "name",
+ reverse: bool = False,
+ ) -> DataFrame:
+ """Create a channel with a glob pattern
+
+ Args:
+ ftype: The file type, one of any, link, dir and file
+ sortby: How the files should be sorted. One of name, mtime and size
+ reverse: Whether sort them in a reversed way.
+
+ Returns:
+ The channel
+ """
+ sort_key = (
+ str
+ if sortby == "name"
+ else path.getmtime
+ if sortby == "mtime"
+ else path.getsize
+ if sortby == "size"
+ else None
+ )
+ file_filter = (
+ path.islink
+ if ftype == "link"
+ else path.isdir
+ if ftype == "dir"
+ else path.isfile
+ if ftype == "file"
+ else None
+ )
+ files = (
+ file
+ for file in glob(str(pattern))
+ if not file_filter or file_filter(file)
+ )
+ return cls.create(
+ sorted(files, key=sort_key, reverse=reverse), # type: ignore
+ )
+
+    @classmethod
+ def from_pairs(
+ cls,
+ pattern: str,
+ ftype: str = "any",
+ sortby: str = "name",
+ reverse: bool = False,
+ ) -> DataFrame:
+ """Create a width=2 channel with a glob pattern
+
+ Args:
+ ftype: The file type, one of any, link, dir and file
+ sortby: How the files should be sorted. One of name, mtime and size
+ reverse: Whether sort them in a reversed way.
+
+ Returns:
+ The channel
+ """
+ mates = cls.from_glob(pattern, ftype, sortby, reverse)
+ return pandas.concat(
+ (
+ mates.iloc[::2].reset_index(drop=True),
+ mates.iloc[1::2].reset_index(drop=True),
+ ),
+ axis=1,
+ )
+
+    @classmethod
+ def from_csv(cls, *args, **kwargs):
+ """Create a channel from a csv file
+
+ Uses pandas.read_csv() to create a channel
+
+ Args:
+ *args: and
+ **kwargs: Arguments passing to pandas.read_csv()
+ """
+ return pandas.read_csv(*args, **kwargs)
+
+    @classmethod
+ def from_excel(cls, *args, **kwargs):
+ """Create a channel from an excel file.
+
+ Uses pandas.read_excel() to create a channel
+
+ Args:
+ *args: and
+ **kwargs: Arguments passing to pandas.read_excel()
+ """
+ return pandas.read_excel(*args, **kwargs)
+
+    @classmethod
+ def from_table(cls, *args, **kwargs):
+ """Create a channel from a table file.
+
+ Uses pandas.read_table() to create a channel
+
+ Args:
+ *args: and
+ **kwargs: Arguments passing to pandas.read_table()
+ """
+ return pandas.read_table(*args, **kwargs)
+
+
+# ----------------------------------------------------------------
+# Verbs
+@register_verb(DataFrame)
+def expand_dir(
+ data: DataFrame,
+ col: str | int = 0,
+ pattern: str = "*",
+ ftype: str = "any",
+ sortby: str = "name",
+ reverse: bool = False,
+) -> DataFrame:
+ """Expand a Channel according to the files in <col>,
+ other cols will keep the same.
+
+ This is only applicable to a 1-row channel.
+
+ Examples:
+ >>> ch = channel.create([('./', 1)])
+ >>> ch >> expand()
+ >>> [['./a', 1], ['./b', 1], ['./c', 1]]
+
+ Args:
+ col: the index or name of the column used to expand
+ pattern: use a pattern to filter the files/dirs, default: `*`
+ ftype: the type of the files/dirs to include
+ - 'dir', 'file', 'link' or 'any' (default)
+ sortby: how the list is sorted
+ - 'name' (default), 'mtime', 'size'
+ reverse: reverse sort.
+
+ Returns:
+ The expanded channel
+ """
+ assert data.shape[0] == 1, "Can only expand a single row DataFrame."
+ col_loc = col if isinstance(col, int) else data.columns.get_loc(col)
+ full_pattern = f"{data.iloc[0, col_loc]}/{pattern}"
+ expanded = Channel.from_glob(
+ full_pattern,
+ ftype,
+ sortby,
+ reverse,
+ ).iloc[:, 0]
+ ret = pandas.concat([data] * expanded.size, axis=0, ignore_index=True)
+ ret.iloc[:, col_loc] = expanded.values
+ return ret.reset_index(drop=True)
+
+
+@register_verb(DataFrame)
+def collapse_files(data: DataFrame, col: str | int = 0) -> DataFrame:
+ """Collapse a Channel according to the files in <col>,
+ other cols will use the values in row 0.
+
+ Note that other values in other rows will be discarded.
+
+ Examples:
+ >>> ch = channel.create([['./a', 1], ['./b', 1], ['./c', 1]])
+ >>> ch >> collapse()
+ >>> [['.', 1]]
+
+ Args:
+ data: The original channel
+ col: the index or name of the column used to collapse on
+
+ Returns:
+ The collapsed channel
+ """
+ assert data.shape[0] > 0, "Cannot collapse on an empty DataFrame."
+ col_loc = col if isinstance(col, int) else data.columns.get_loc(col)
+ paths = list(data.iloc[:, col_loc])
+ compx = path.dirname(path.commonprefix(paths))
+ ret = data.iloc[[0], :].copy()
+ ret.iloc[0, col_loc] = compx
+ return ret
+
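A short usage sketch of the creators and verbs defined above; the ./data directory, its contents and the *.txt pattern are made up, any directory of files works:
>>> from pipen.channel import Channel, expand_dir, collapse_files
>>>
>>> ch = Channel.create([("./data", "sample1")])          # 1 row, 2 columns
>>> expanded = ch >> expand_dir(col=0, pattern="*.txt")   # one row per ./data/*.txt
>>> collapsed = expanded >> collapse_files(col=0)         # back to a single row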
"""Print help for commands"""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
+from ._hooks import CLIPlugin
+
+if TYPE_CHECKING:
+ from argx import ArgumentParser
+ from argparse import Namespace
+
+__all__ = ("CLIHelpPlugin",)
+
+
+class CLIHelpPlugin(CLIPlugin):
+ """Print help for commands"""
+
+ name = "help"
+
+ def __init__(self, parser: ArgumentParser, subparser: ArgumentParser):
+ super().__init__(parser, subparser)
+ subparser.add_argument(
+ "cmd",
+ nargs="?",
+ choices=[
+ n
+ for n in parser._subparsers._group_actions[0].choices
+ if n != "help"
+ ],
+ help="The command to show help for",
+ )
+
+    def exec_command(self, args: Namespace) -> None:
+ """Run the command"""
+
+ if not args.cmd:
+ self.parser.parse_args(["--help"])
+ else:
+ self.parser.parse_args([args.cmd, "--help"])
+
"""List plugins"""
+from __future__ import annotations
+from typing import TYPE_CHECKING, Any, Iterable, List, Tuple
+
+from rich import print
+
+from ._hooks import CLIPlugin
+from ..defaults import (
+ CLI_ENTRY_GROUP,
+ SCHEDULER_ENTRY_GROUP,
+ TEMPLATE_ENTRY_GROUP,
+)
+from ..utils import load_entrypoints
+
+if TYPE_CHECKING:
+ from argx import ArgumentParser
+ from argparse import Namespace
+
+
+COMMAND = "plugins"
+GROUPS = [
+ "pipen",
+ SCHEDULER_ENTRY_GROUP,
+ TEMPLATE_ENTRY_GROUP,
+ CLI_ENTRY_GROUP,
+]
+GROUP_NAMES = {
+ "pipen": "Pipen",
+ SCHEDULER_ENTRY_GROUP: "Scheduler",
+ TEMPLATE_ENTRY_GROUP: "Template",
+ CLI_ENTRY_GROUP: "CLI",
+}
+
+__all__ = ("CliPluginsPlugin",)
+
+
+def _get_plugins_by_group(group: str) -> Iterable[Tuple[str, Any]]:
+ """Get plugins from entry points by group name
+
+ Args:
+ group: The name of the group
+
+ Returns:
+ A list of tuples with the plugin name and the plugin itself
+ """
+ for name, obj in load_entrypoints(group):
+ yield name, obj
+
+
+def _list_group_plugins(
+ group: str,
+ plugins: List[Tuple[str, Any]],
+) -> None:
+ """List plugins in a single group
+
+ Args:
+ group: The group of the plugins
+ plugins: A list of tuples with name and plugin
+ """
+ print("")
+ print(f"[bold][u]{GROUP_NAMES[group]} plugins:[/u][/bold]")
+ namelen = max(len(name) for name, _ in plugins) if plugins else 0
+ for name, plugin in plugins:
+ try:
+ ver = plugin.version
+ except AttributeError:
+ try:
+ ver = plugin.__version__
+ except AttributeError:
+ ver = "unknown"
+ print(f"- {name.ljust(namelen)}: (version: {ver})")
+
+
+def _list_plugins(plugins: List[Tuple[str, str, Any]]) -> None:
+ """List plugins
+
+ Args:
+ plugins: A list of tuples with group, name and plugin
+ """
+ pipen_plugins = [
+ (name, plugin) for group, name, plugin in plugins if group == "pipen"
+ ]
+ sched_plugins = [
+ (name, plugin)
+ for group, name, plugin in plugins
+ if group == SCHEDULER_ENTRY_GROUP
+ ]
+ tpl_plugins = [
+ (name, plugin)
+ for group, name, plugin in plugins
+ if group == TEMPLATE_ENTRY_GROUP
+ ]
+ cli_plugins = [
+ (name, plugin)
+ for group, name, plugin in plugins
+ if group == CLI_ENTRY_GROUP
+ ]
+ _list_group_plugins("pipen", pipen_plugins)
+ _list_group_plugins(SCHEDULER_ENTRY_GROUP, sched_plugins)
+ _list_group_plugins(TEMPLATE_ENTRY_GROUP, tpl_plugins)
+ _list_group_plugins(CLI_ENTRY_GROUP, cli_plugins)
+
+
+class CliPluginsPlugin(CLIPlugin):
+ """List installed plugins"""
+
+ name = "plugins"
+
+ def __init__(
+ self,
+ parser: ArgumentParser,
+ subparser: ArgumentParser,
+ ) -> None:
+ super().__init__(parser, subparser)
+ subparser.add_argument(
+ "-g",
+ "--group",
+ choices=GROUPS + ["all"],
+ default="all",
+ help="The name of the entry point group. Show all if not provided",
+ )
+
+    def exec_command(self, args: Namespace) -> None:
+ """Execute the command"""
+ from ..version import __version__
+ print("Pipen version:", __version__)
+
+ plugins: List[Tuple[str, str, Any]] = []
+
+ if args.group and args.group != "all":
+ for name, plugin in _get_plugins_by_group(args.group):
+ plugins.append((args.group, name, plugin))
+
+ else: # args.name
+ for group in GROUPS:
+ for name, plugin in _get_plugins_by_group(group):
+ plugins.append((group, name, plugin))
+
+ _list_plugins(plugins)
+
"""List available profiles."""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
+import rtoml # type: ignore
+from rich import print
+from rich.panel import Panel
+from rich.syntax import Syntax
+from simpleconf import ProfileConfig
+
+from ._hooks import CLIPlugin
+from ..defaults import CONFIG, CONFIG_FILES
+
+if TYPE_CHECKING:
+ from argx import ArgumentParser
+ from argparse import Namespace
+
+__all__ = ("CLIProfilePlugin",)
+
+
+class CLIProfilePlugin(CLIPlugin):
+ """List available profiles."""
+
+ name = "profile"
+
+ def __init__(
+ self,
+ parser: ArgumentParser,
+ subparser: ArgumentParser,
+ ) -> None:
+ super().__init__(parser, subparser)
+ subparser.add_argument(
+ "-n",
+ "--name",
+ default="",
+ help="The name of the profile to show. Show all if not provided.",
+ )
+ subparser.add_argument(
+ "-l",
+ "--list",
+ action="store_true",
+ default=False,
+ help="List the names of all available profiles (-n won't work).",
+ )
+
+    def exec_command(self, args: Namespace) -> None:
+ """Run the command"""
+
+ config = ProfileConfig.load(
+ {"default": CONFIG},
+ *CONFIG_FILES,
+ ignore_nonexist=True,
+ )
+
+ if args.list:
+ print("\n".join(ProfileConfig.profiles(config)))
+ return
+
+ print("Configurations loaded from:")
+ print("- pipen.defaults.CONFIG (python dictionary)")
+ for conffile in reversed(CONFIG_FILES):
+ print(f"- {conffile}")
+ print("")
+
+ print("Note:")
+ print(
+ "- The same profile from different configuration files "
+            "is inherited."
+ )
+ print(
+            "- These configurations can still be overridden by "
+ "Pipen constructor and process definition."
+ )
+ print("")
+
+ if not args.name:
+ for profile in ProfileConfig.profiles(config):
+ with ProfileConfig.with_profile(config, profile):
+ conf = ProfileConfig.detach(config)
+ print(
+ Panel(
+ Syntax(rtoml.dumps(conf), "toml"),
+ title=f"Profile: {profile}",
+ title_align="left",
+ )
+ )
+
+ else:
+ if not ProfileConfig.has_profile(config, args.name):
+ raise ValueError(f"No such profile: {args.name}")
+
+ ProfileConfig.use_profile(config, args.name)
+ conf = ProfileConfig.detach(config)
+ print(
+ Panel(
+ Syntax(rtoml.dumps(conf), "toml"),
+ title=f"Profile: {args.name}",
+ title_align="left",
+ )
+ )
+
"""Print help for commands"""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
+from rich import print
+
+from ._hooks import CLIPlugin
+
+if TYPE_CHECKING:
+ from argparse import Namespace
+
+__all__ = ("CLIVersionPlugin",)
+
+
+class CLIVersionPlugin(CLIPlugin):
+ """Print versions of pipen and its dependencies"""
+
+ name = "version"
+
+    def exec_command(self, args: Namespace) -> None:
+ """Run the command"""
+ import sys
+ from importlib.metadata import version
+ from .. import __version__
+
+ versions = {"python": sys.version, "pipen": __version__}
+
+ for pkg in (
+ "liquidpy",
+ "pandas",
+ "enlighten",
+ "argx",
+ "xqute",
+ "python-simpleconf",
+ "pipda",
+ "varname",
+ ):
+ versions[pkg] = version(pkg)
+
+ keylen = max(map(len, versions))
+ for key in versions:
+ ver = versions[key]
+ verlines = ver.splitlines()
+ print(f"{key.ljust(keylen)}: {verlines.pop(0)}")
+ for verline in verlines: # pragma: no cover
+ print(f"{' ' * keylen} {verline}")
+
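Third-party subcommands follow the same pattern as the bundled ones above: subclass CLIPlugin, add arguments in __init__, act in exec_command, and expose the class through the pipen_cli entry-point group. A minimal sketch; the command name "hello", its argument and the package metadata are hypothetical:

"""A hypothetical `pipen hello` subcommand"""
from __future__ import annotations
from typing import TYPE_CHECKING

from pipen.cli import CLIPlugin

if TYPE_CHECKING:
    from argx import ArgumentParser
    from argparse import Namespace


class CLIHelloPlugin(CLIPlugin):
    """Say hello from the pipen CLI"""

    name = "hello"

    def __init__(self, parser: ArgumentParser, subparser: ArgumentParser) -> None:
        super().__init__(parser, subparser)
        subparser.add_argument("--who", default="world", help="Who to greet")

    def exec_command(self, args: Namespace) -> None:
        print(f"Hello, {args.who}!")

# Exposed via the pipen_cli entry-point group, e.g. in pyproject.toml:
# [project.entry-points.pipen_cli]
# hello = "pipen_cli_hello:CLIHelloPlugin"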
"""Provide CLI for pipen"""
+
+from ._hooks import CLIPlugin
+from ._main import main
+
"""Provide some default values/objects"""
+from pathlib import Path
+from typing import ClassVar
+
+from diot import Diot
+from xqute import JobErrorStrategy
+from xqute.utils import logger as xqute_logger
+
+# Remove the rich handler
+_xqute_handlers = xqute_logger.handlers
+if _xqute_handlers:
+ # The very first handler is the rich handler
+ xqute_logger.removeHandler(_xqute_handlers[0])
+
+LOGGER_NAME = "core"
+CONFIG_FILES = (
+ Path("~/.pipen.toml").expanduser(),
+ "./.pipen.toml",
+ "PIPEN.osenv",
+)
+CONFIG = Diot(
+ # pipeline level: The logging level
+ loglevel="info",
+ # process level: The cache option, True/False/export
+ cache=True,
+    # process level: Whether (and how deep) to expand the directory to check the signature
+ dirsig=1,
+ # process level:
+ # How to deal with the errors
+ # retry, ignore, halt
+ # halt to halt the whole pipeline, no submitting new jobs
+ # terminate to just terminate the job itself
+ error_strategy=JobErrorStrategy.IGNORE,
+ # process level:
+    # How many times to retry the jobs once an error occurs
+ num_retries=3,
+ # process level:
+    # How many jobs to run simultaneously
+ forks=1,
+ # process level: Default shell/language
+ lang="bash",
+ # process level:
+ # How many jobs to be submitted in a batch
+ submission_batch=8,
+ # pipeline level:
+ # The working directory for the pipeline
+ workdir="./.pipen",
+ # process level: template engine
+ template="liquid",
+ # process level: template options
+ template_opts={},
+ # process level: scheduler
+ scheduler="local",
+ # process level: scheduler options
+ scheduler_opts={},
+ # pipeline level: plugins
+ plugins=None,
+ # pipeline level: plugin opts
+ plugin_opts={},
+)
+
+# Just the total width of the terminal
+# when logging with a rich.Panel()
+CONSOLE_WIDTH_WITH_PANEL = 100
+# The width of the terminal when the width cannot be detected,
+# we are probably logging into a file
+CONSOLE_DEFAULT_WIDTH = 2048
+# [05/16/22 11:46:40] I
+# v0.3.4:
+# 05-16 11:11:11 I
+# The markup code is included
+# Don't modify this unless the logger formatter is changed
+CONSOLE_WIDTH_SHIFT = 25
+# For pipen scheduler plugins
+SCHEDULER_ENTRY_GROUP = "pipen_sched"
+# For pipen template plugins
+TEMPLATE_ENTRY_GROUP = "pipen_tpl"
+# For pipen template cli plugins
+CLI_ENTRY_GROUP = "pipen_cli"
+
+
+class ProcInputType:
+ """Types for process inputs"""
+
+ VAR: ClassVar[str] = "var"
+ FILE: ClassVar[str] = "file"
+ DIR: ClassVar[str] = "dir"
+ FILES: ClassVar[str] = "files"
+ DIRS: ClassVar[str] = "dirs"
+
+
+class ProcOutputType:
+ """Types for process outputs"""
+
+ VAR: ClassVar[str] = "var"
+ DIR: ClassVar[str] = "dir"
+ FILE: ClassVar[str] = "file"
+
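A hedged sketch of how a few of these defaults are overridden in practice; the process and pipeline below are hypothetical.

from pipen import Pipen, Proc


class MyProc(Proc):
    """Hypothetical process overriding some process-level defaults"""

    input = "x:var"
    script = "echo {{in.x}}"
    forks = 4                  # run 4 jobs simultaneously (default: 1)
    cache = False              # always rerun, ignore cached signatures
    error_strategy = "retry"   # instead of the default "ignore"
    num_retries = 1


# Pipeline-level options go to the Pipen constructor, or to a profile in
# ~/.pipen.toml / ./.pipen.toml (see CONFIG_FILES above)
pipe = Pipen(name="demo", loglevel="debug", scheduler="local")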
"""Provide exception classes"""
+
+
+class PipenException(Exception):
+ """Base exception class for pipen"""
+
+
+class PipenSetDataError(PipenException, ValueError):
+ """When trying to set input data to processes with input_data already set
+ using Pipen.set_data()."""
+
+
+class ProcInputTypeError(PipenException, TypeError):
+ """When an unsupported input type is provided"""
+
+
+class ProcInputKeyError(PipenException, KeyError):
+ """When an unsupported input key is provided"""
+
+
+class ProcInputValueError(PipenException, ValueError):
+ """When an unsupported input value is provided"""
+
+
+class ProcScriptFileNotFound(PipenException, FileNotFoundError):
+ """When script file specified as 'file://' cannot be found"""
+
+
+class ProcOutputNameError(PipenException, NameError):
+ """When no name or malformatted output is provided"""
+
+
+class ProcOutputTypeError(PipenException, TypeError):
+ """When an unsupported output type is provided"""
+
+
+class ProcOutputValueError(PipenException, ValueError):
+ """When a malformatted output value is provided"""
+
+
+class ProcDependencyError(PipenException):
+    """When there is something wrong with the process dependencies"""
+
+
+class NoSuchSchedulerError(PipenException):
+ """When specified scheduler cannot be found"""
+
+
+class WrongSchedulerTypeError(PipenException, TypeError):
+ """When specified scheduler is not a subclass of Scheduler"""
+
+
+class NoSuchTemplateEngineError(PipenException):
+ """When specified template engine cannot be found"""
+
+
+class WrongTemplateEnginTypeError(PipenException, TypeError):
+    """When the specified template engine is not a subclass of Template"""
+
+
+class TemplateRenderingError(PipenException):
+ """Failed to render a template"""
+
+
+class ConfigurationError(PipenException):
+    """When something wrong is set as configuration"""
+
+
+class PipenOrProcNameError(PipenException):
+    """When more than one process is sharing the same workdir"""
+
"""Provide the Job class"""
+from __future__ import annotations
+
+import logging
+import shlex
+import shutil
+from functools import cached_property
+from os import PathLike
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, Mapping
+
+from diot import OrderedDiot
+from xqute import Job as XquteJob
+from xqute.utils import a_read_text
+
+from ._job_caching import JobCaching
+from .defaults import ProcInputType, ProcOutputType
+from .exceptions import (
+ ProcInputTypeError,
+ ProcOutputNameError,
+ ProcOutputTypeError,
+ TemplateRenderingError,
+)
+from .template import Template
+from .utils import logger, strsplit
+from .pluginmgr import plugin
+
+if TYPE_CHECKING: # pragma: no cover
+ from .proc import Proc
+
+
+class Job(XquteJob, JobCaching):
+ """The job for pipen"""
+
+ __slots__ = ("proc", "_output_types", "_outdir")
+
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
+ super().__init__(*args, **kwargs)
+ self.proc: Proc = None
+ self._output_types: Dict[str, str] = {}
+ self._outdir = self.metadir / "output"
+
+    @property
+ def script_file(self) -> Path:
+ """Get the path to script file
+
+ Returns:
+ The path to the script file
+ """
+ return self.metadir / "job.script"
+
+ @cached_property
+ def outdir(self) -> Path:
+ """Get the path to the output directory
+
+ Returns:
+ The path to the job output directory
+ """
+ ret = Path(self._outdir)
+ # if ret is a dead link
+ # when switching a proc from end/nonend to nonend/end
+ if ret.is_symlink() and not ret.exists():
+ ret.unlink() # pragma: no cover
+ ret.mkdir(parents=True, exist_ok=True)
+ # If it is somewhere else, make a symbolic link to the metadir
+ metaout = self.metadir / "output"
+ if ret != metaout:
+ if metaout.is_symlink() or metaout.is_file():
+ metaout.unlink()
+ elif metaout.is_dir():
+ shutil.rmtree(metaout)
+ metaout.symlink_to(ret)
+ return ret
+
+ @cached_property
+ def input(self) -> Mapping[str, Any]:
+ """Get the input data for this job
+
+ Returns:
+ A key-value map, where keys are the input keys
+ """
+ import pandas
+
+ ret = self.proc.input.data.iloc[self.index, :].to_dict()
+ # check types
+ for inkey, intype in self.proc.input.type.items():
+
+ if intype == ProcInputType.VAR or ret[inkey] is None:
+ continue # pragma: no cover, covered actually
+
+ if intype in (ProcInputType.FILE, ProcInputType.DIR):
+ if not isinstance(ret[inkey], (str, PathLike)):
+ raise ProcInputTypeError(
+ f"[{self.proc.name}] Got {type(ret[inkey])} instead of "
+ f"PathLike object for input: {inkey + ':' + intype!r}"
+ )
+
+ # we should use it as a string
+ ret[inkey] = plugin.hooks.norm_inpath(
+ self,
+ ret[inkey],
+ intype == ProcInputType.DIR,
+ )
+
+ if intype in (ProcInputType.FILES, ProcInputType.DIRS):
+ if isinstance(ret[inkey], pandas.DataFrame):
+ # // todo: nested dataframe
+ ret[inkey] = ret[inkey].iloc[0, 0] # pragma: no cover
+
+ if not isinstance(ret[inkey], (list, tuple)):
+ raise ProcInputTypeError(
+ f"[{self.proc.name}] Expected a sequence for input: "
+ f"{inkey + ':' + intype!r}, got {type(ret[inkey])}"
+ )
+
+ for i, file in enumerate(ret[inkey]):
+ ret[inkey][i] = plugin.hooks.norm_inpath(
+ self,
+ file,
+ intype == ProcInputType.DIRS,
+ )
+
+ return ret
+
+ @cached_property
+ def output(self) -> Mapping[str, Any]:
+ """Get the output data of the job
+
+ Returns:
+ The key-value map where the keys are the output keys
+ """
+ output_template = self.proc.output
+ if not output_template:
+ return {}
+
+ data = {
+ "job": dict(
+ index=self.index,
+ metadir=str(self.metadir),
+ outdir=str(self.outdir),
+ stdout_file=str(self.stdout_file),
+ stderr_file=str(self.stderr_file),
+ jid_file=str(self.jid_file),
+ ),
+ "in": self.input,
+ "in_": self.input,
+ "proc": self.proc,
+ "envs": self.proc.envs,
+ }
+ try:
+ if isinstance(output_template, Template):
+ # // TODO: check ',' in output value?
+ outputs = strsplit(output_template.render(data), ",")
+ else:
+ outputs = [oput.render(data) for oput in output_template]
+ except Exception as exc:
+ raise TemplateRenderingError(
+ f"[{self.proc.name}] Failed to render output."
+ ) from exc
+
+ ret = OrderedDiot()
+ for oput in outputs:
+ if ":" not in oput:
+ raise ProcOutputNameError(
+ f"[{self.proc.name}] No name given in output."
+ )
+
+ if oput.count(":") == 1:
+ output_name, output_value = oput.split(":")
+ output_type = ProcOutputType.VAR
+ else:
+ output_name, output_type, output_value = oput.split(":", 2)
+ if output_type not in ProcOutputType.__dict__.values():
+ raise ProcOutputTypeError(
+ f"[{self.proc.name}] "
+ f"Unsupported output type: {output_type}"
+ )
+
+ self._output_types[output_name] = output_type
+
+ if output_type == ProcOutputType.VAR:
+ ret[output_name] = output_value
+ else:
+ ret[output_name] = plugin.hooks.norm_outpath(
+ self,
+ output_value,
+ output_type == ProcOutputType.DIR,
+ )
+
+ return ret
+
+ @cached_property
+ def template_data(self) -> Mapping[str, Any]:
+ """Get the data for template rendering
+
+ Returns:
+ The data for template rendering
+ """
+
+ return {
+ "job": dict(
+ index=self.index,
+ metadir=str(self.metadir),
+ outdir=str(self.outdir),
+ stdout_file=str(self.stdout_file),
+ stderr_file=str(self.stderr_file),
+ jid_file=str(self.jid_file),
+ ),
+ "in": self.input,
+ "in_": self.input,
+ "out": self.output,
+ "proc": self.proc,
+ "envs": self.proc.envs,
+ }
+
+    def log(
+ self,
+ level: int | str,
+ msg: str,
+ *args,
+ limit: int = 3,
+ limit_indicator: bool = True,
+ logger: logging.LoggerAdapter = logger,
+ ) -> None:
+ """Log message for the jobs
+
+ Args:
+ level: The log level of the record
+ msg: The message to log
+ *args: The arguments to format the message
+ limit: limitation of the log (don't log for all jobs)
+ limit_indicator: Whether to show an indicator saying the log
+ has been limited (the level of the indicator will be DEBUG)
+ logger: The logger used to log
+ """
+ if self.index > limit:
+ return
+
+ if self.index == limit:
+ if limit_indicator:
+ msg = f"{msg} (not showing similar logs)"
+
+ job_index_indicator = "[%s/%s] " % (
+ str(self.index).zfill(len(str(self.proc.size - 1))),
+ self.proc.size - 1,
+ )
+
+ self.proc.log(level, job_index_indicator + msg, *args, logger=logger)
+
+    async def prepare(self, proc: Proc) -> None:
+ """Prepare the job by given process
+
+ Primarily prepare the script, and provide cmd to the job for xqute
+ to wrap and run
+
+ Args:
+ proc: the process object
+ """
+ # Attach the process
+ self.proc = proc
+
+ if self.proc.export and len(self.proc.jobs) == 1:
+ # Don't put index if it is a single-job process
+ self._outdir = Path(self.proc.pipeline.outdir) / self.proc.name
+
+ elif self.proc.export:
+ self._outdir = (
+ Path(self.proc.pipeline.outdir)
+ / self.proc.name
+ / str(self.index)
+ )
+
+ if not proc.script:
+ self.cmd = []
+ return
+
+ template_data = self.template_data
+ try:
+ script = proc.script.render(template_data)
+ except Exception as exc:
+ raise TemplateRenderingError(
+ f"[{self.proc.name}] Failed to render script."
+ ) from exc
+ if (
+ self.script_file.is_file()
+ and await a_read_text(self.script_file) != script
+ ):
+ self.log("debug", "Job script updated.")
+ self.script_file.write_text(script)
+ elif not self.script_file.is_file():
+ self.script_file.write_text(script)
+
+ lang = proc.lang or proc.pipeline.config.lang
+ self.cmd = shlex.split(lang) + [self.script_file] # type: ignore
+
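For reference, the mapping built by template_data above is what a process script template can refer to. A hedged, hypothetical example:

from pipen import Proc


class Summarize(Proc):
    """Hypothetical process using the job-level template variables"""

    input = "infile:file"
    # Rendered per job; file/dir outputs are created under job.outdir
    output = "outfile:file:summary-{{job.index}}.txt"
    lang = "bash"
    # `job`, `in`, `out`, `proc` and `envs` all come from Job.template_data
    script = """
        echo "job {{job.index}} of {{proc.name}}" > {{out.outfile}}
        wc -l {{in.infile}} >> {{out.outfile}}
    """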
"""Main entry module, provide the Pipen class"""
+from __future__ import annotations
+
+import asyncio
+from os import PathLike
+from pathlib import Path
+from typing import Any, ClassVar, Iterable, List, Sequence, Type
+
+from diot import Diot
+from rich import box
+from rich.panel import Panel
+from rich.text import Text
+from simpleconf import ProfileConfig
+from varname import varname, VarnameException
+
+from .defaults import CONFIG, CONFIG_FILES
+from .exceptions import (
+ PipenOrProcNameError,
+ ProcDependencyError,
+ PipenSetDataError,
+)
+from .pluginmgr import plugin
+from .proc import Proc
+from .progressbar import PipelinePBar
+from .utils import (
+ copy_dict,
+ desc_from_docstring,
+ get_logpanel_width,
+ is_valid_name,
+ log_rich_renderable,
+ logger,
+ pipen_banner,
+)
+
+
+class Pipen:
+ """The Pipen class provides interface to assemble and run the pipeline
+
+ Attributes:
+ name: The name of the pipeline
+ desc: The description of the pipeline
+ outdir: The output directory of the results
+ procs: The processes
+ pbar: The progress bar
+ starts: The start processes
+ config: The configurations
+ workdir: The workdir for the pipeline
+ profile: The profile of the configurations to run the pipeline
+        _kwargs: The extra configurations passed to overwrite the default ones
+
+ PIPELINE_COUNT: How many pipelines are loaded
+ SETUP: Whether the one-time setup hook is called
+
+ Args:
+ name: The name of the pipeline
+ desc: The description of the pipeline
+ outdir: The output directory of the results
+ **kwargs: Other configurations
+ """
+
+ PIPELINE_COUNT: ClassVar[int] = 0
+ SETUP: ClassVar[bool] = False
+
+ name: str | None = None
+ desc: str | None = None
+ outdir: str | PathLike = None
+ starts: List[Proc] = []
+ data: Iterable | None = None
+ # other configs
+
+ def __init__(
+ self,
+ name: str | None = None,
+ desc: str | None = None,
+ outdir: str | PathLike = None,
+ **kwargs,
+ ) -> None:
+ """Constructor"""
+ self.procs: List[Proc] = None
+ self.pbar: PipelinePBar = None
+ if name is not None:
+ self.name = name
+ elif self.__class__.name is not None:
+ self.name = self.__class__.name
+ else:
+ try:
+ self.name = varname() # type: ignore
+ except VarnameException:
+ if self.__class__.PIPELINE_COUNT == 0:
+ self.name = self.__class__.__name__
+ else:
+ self.name = (
+ f"{self.__class__.__name__}-"
+ f"{self.__class__.PIPELINE_COUNT}"
+ )
+
+ if not is_valid_name(self.name):
+ raise PipenOrProcNameError(
+ fr"Invalid pipeline name: {self.name}, expecting '^[\w.-]$'"
+ )
+
+ self.desc = (
+ desc
+ or self.__class__.desc
+ or desc_from_docstring(self.__class__, Pipen)
+ )
+ self.outdir = Path(
+ outdir or self.__class__.outdir or f"./{self.name}-output"
+ ).resolve()
+ self.workdir: Path = None
+ self.profile: str = "default"
+
+ self.starts: List[Proc] = self.__class__.starts
+ if self.starts and not isinstance(self.starts, (tuple, list)):
+ self.starts = [self.starts]
+
+ self.config = Diot(copy_dict(CONFIG, 3))
+ # We shouldn't update the config here, since we don't know
+ # the profile yet
+ self._kwargs = {
+ key: value
+ for key, value in self.__class__.__dict__.items()
+ if key in self.config
+ }
+ self._kwargs.setdefault("plugin_opts", {}).update(
+ kwargs.pop("plugin_opts", {})
+ )
+ self._kwargs.setdefault("template_opts", {}).update(
+ kwargs.pop("template_opts", {})
+ )
+ self._kwargs.setdefault("scheduler_opts", {}).update(
+ kwargs.pop("scheduler_opts", {})
+ )
+ self._kwargs.update(kwargs)
+ # Initialize the workdir, as workdir is created before _init()
+ # But the config is updated in _init()
+ # Here we hack it to have the workdir passed in.
+ if "workdir" in kwargs:
+ self.config.workdir = kwargs["workdir"]
+
+ if not self.__class__.SETUP: # pragma: no cover
+ # Load plugins from entrypotins at runtime to avoid
+ # cyclic imports
+ plugin.load_entrypoints()
+
+ plugins = self._kwargs.get("plugins", None)
+ if plugins is None:
+ plugins = self.config.plugins
+ self.plugin_context = plugin.plugins_context(plugins)
+ self.plugin_context.__enter__()
+
+ # make sure core plugin is enabled
+ plugin.get_plugin("core").enable()
+
+ if not self.__class__.SETUP: # pragma: no cover
+ plugin.hooks.on_setup(self.config)
+ self.__class__.SETUP = True
+
+ self.__class__.PIPELINE_COUNT += 1
+
+ if self.__class__.data is not None:
+ self.set_data(*self.__class__.data)
+
+    def __init_subclass__(cls) -> None:
+ cls.PIPELINE_COUNT = 0
+
+    async def async_run(self, profile: str = "default") -> bool:
+ """Run the processes one by one
+
+ Args:
+ profile: The default profile to use for the run
+
+ Returns:
+ True if the pipeline ends successfully else False
+ """
+ self.profile = profile
+ self.workdir = Path(self.config.workdir) / self.name
+ # self.workdir.mkdir(parents=True, exist_ok=True)
+
+ succeeded = True
+ await self._init()
+ logger.setLevel(self.config.loglevel.upper())
+ log_rich_renderable(pipen_banner(), "magenta", logger.info)
+ try:
+ self.build_proc_relationships()
+ self._log_pipeline_info()
+ await plugin.hooks.on_start(self)
+ for proc in self.procs:
+ self.pbar.update_proc_running()
+ proc_obj = proc(self) # type: ignore
+ if proc in self.starts and proc.input_data is None:
+ proc_obj.log(
+ "warning",
+ "This is a start process, "
+ "but no 'input_data' specified.",
+ )
+ await proc_obj.init()
+ await proc_obj.run()
+ if proc_obj.succeeded:
+ self.pbar.update_proc_done()
+ else:
+ self.pbar.update_proc_error()
+ succeeded = False
+ break
+ proc_obj.gc()
+
+ logger.info("")
+ except Exception:
+ raise
+ else:
+ await plugin.hooks.on_complete(self, succeeded)
+ finally:
+ self.plugin_context.__exit__()
+ if self.pbar:
+ self.pbar.done()
+
+ return succeeded
+
+    def run(
+ self,
+ profile: str = "default",
+ ) -> bool:
+ """Run the pipeline with the given profile
+ This is just a sync wrapper for the async `async_run` function using
+ `asyncio.run()`
+
+ Args:
+ profile: The default profile to use for the run
+
+ Returns:
+ True if the pipeline ends successfully else False
+ """
+ return asyncio.run(self.async_run(profile))
+
+    def set_data(self, *indata: Any) -> Pipen:
+ """Set the input_data for start processes
+
+ Args:
+ *indata: The input data for the start processes
+ The data will set for the processes in the order determined by
+ `set_starts()`.
+ If a process has input_data set, an error will be raised.
+ To use that input_data, set None here in the corresponding
+ position for the process
+
+ Raises:
+ ProcInputDataError: When trying to set input data to
+ processes with input_data already set
+
+ Returns:
+ `self` to chain the operations
+ """
+ for start, data in zip(self.starts, indata):
+ if data is None:
+ continue
+ if start.input_data is not None:
+ raise PipenSetDataError(
+ f"`input_data` has already set for {start}. "
+ "If you want to use it, set `None` at the position of "
+ "this process for `Pipen.set_data()`."
+ )
+ start.input_data = data
+ return self
+
+    def set_starts(
+ self,
+ *procs: Type[Proc] | Sequence[Type[Proc]],
+ clear: bool = True,
+ ):
+ """Set the starts
+
+ Args:
+ *procs: The processes to set as starts of the pipeline.
+            clear: Whether to clear previously set starts
+
+ Raises:
+ ProcDependencyError: When processes set as starts repeatedly
+
+ Returns:
+ `self` to chain the operations
+ """
+ if clear:
+ self.starts = []
+ self.procs = None
+
+ for proc in procs:
+ if isinstance(proc, (list, tuple)):
+ self.set_starts(*proc, clear=False)
+ elif not isinstance(proc, type) or not issubclass(proc, Proc):
+ raise ProcDependencyError(
+ f"{proc!r} is not a subclass of 'pipen.Proc'."
+ )
+ elif proc not in self.starts:
+ self.starts.append(proc) # type: ignore
+ else:
+ raise ProcDependencyError(
+ f"{proc} is already a start process."
+ )
+ return self
+
+ # In case people forget the "s"
+ set_start = set_starts
+
+ def _log_pipeline_info(self) -> None:
+ """Print the information of the pipeline"""
+ logger.info("")
+ # Pipeline line and description
+ log_rich_renderable(
+ Panel(
+ self.desc or Text(self.name.upper(), justify="center"),
+ width=get_logpanel_width(),
+ # padding=(0, 1),
+ box=box.DOUBLE_EDGE,
+ title=self.name.upper() if self.desc else None,
+ ),
+ "magenta",
+ logger.info,
+ )
+ fmt = "[bold][magenta]%-16s:[/magenta][/bold] %s"
+ enabled_plugins = (
+ "{name} [cyan]{version}[/cyan]".format(
+ name=name,
+ version=(f"v{plg.version}" if plg.version else ""),
+ )
+ for name, plg in plugin.get_enabled_plugins().items()
+ if name != "core"
+ )
+ for i, plug in enumerate(enabled_plugins):
+ logger.info(fmt, "plugins" if i == 0 else "", plug)
+ logger.info(fmt, "# procs", len(self.procs))
+ logger.info(fmt, "profile", self.profile)
+ logger.info(fmt, "outdir", self.outdir)
+ logger.info(fmt, "cache", self.config.cache)
+ logger.info(fmt, "dirsig", self.config.dirsig)
+ logger.info(fmt, "error_strategy", self.config.error_strategy)
+ logger.info(fmt, "forks", self.config.forks)
+ logger.info(fmt, "lang", self.config.lang)
+ logger.info(fmt, "loglevel", self.config.loglevel)
+ logger.info(fmt, "num_retries", self.config.num_retries)
+ logger.info(fmt, "scheduler", self.config.scheduler)
+ logger.info(fmt, "submission_batch", self.config.submission_batch)
+ logger.info(fmt, "template", self.config.template)
+ logger.info(fmt, "workdir", self.workdir)
+ for i, (key, val) in enumerate(self.config.plugin_opts.items()):
+ logger.info(fmt, "plugin_opts" if i == 0 else "", f"{key}={val}")
+ for i, (key, val) in enumerate(self.config.scheduler_opts.items()):
+ logger.info(
+ fmt, "scheduler_opts" if i == 0 else "", f"{key}={val}"
+ )
+ for i, (key, val) in enumerate(self.config.template_opts.items()):
+ logger.info(fmt, "template_opts" if i == 0 else "", f"{key}={val}")
+
+ async def _init(self) -> None:
+ """Compute the configurations for the pipeline based on the priorities
+
+ Configurations (priority from low to high)
+ 1. The default config in .defaults
+ 2. The plugin_opts defined in plugins (via on_setup() hook)
+ (see __init__())
+ 3. Configuration files
+ 4. **kwargs from Pipen(..., **kwargs)
+ 5. Those defined in each Proc class
+ """
+ # Then load the configurations from config files
+ config = ProfileConfig.load(
+ {"default": self.config},
+ *CONFIG_FILES,
+ ignore_nonexist=True,
+ )
+ self.config = ProfileConfig.use_profile(
+ config, self.profile, copy=True
+ )
+
+ # configs from files and CONFIG are loaded
+ # allow plugins to change the default configs
+ await plugin.hooks.on_init(self)
+ self.workdir.mkdir(parents=True, exist_ok=True)
+ # Then load the extra configurations passed from __init__(**kwargs)
+ # Make sure dict options get inherited
+ self.config.template_opts.update(self._kwargs.pop("template_opts", {}))
+ self.config.scheduler_opts.update(
+ self._kwargs.pop("scheduler_opts", {})
+ )
+ self.config.plugin_opts.update(self._kwargs.pop("plugin_opts", {}))
+ self.config.update(self._kwargs)
+
+    def build_proc_relationships(self) -> None:
+ """Build the proc relationships for the pipeline"""
+ if self.procs:
+ return
+
+ if not self.starts:
+ raise ProcDependencyError(
+ "No start processes specified. "
+ "Did you forget to call `Pipen.set_starts()`?"
+ )
+
+ # build proc relationships
+ # Allow starts to be set as a tuple
+ self.procs = list(self.starts)
+ nexts = set(
+ sum((proc.nexts or [] for proc in self.procs), []) # type: ignore
+ )
+ logger.debug("")
+ logger.debug("Building process relationships:")
+ logger.debug("- Start processes: %s", self.procs)
+ while nexts:
+ logger.debug("- Next processes: %s", nexts)
+ # pick up one that can be added to procs
+ for proc in sorted(
+ nexts, key=lambda prc: (prc.order or 0, prc.name)
+ ):
+ if proc in self.procs:
+ raise ProcDependencyError(
+ f"Cyclic dependency: {proc.name}"
+ )
+
+ if proc.name in [p.name for p in self.procs]:
+ raise PipenOrProcNameError(
+ f"'{proc.name}' is already used by another process."
+ )
+
+ # Add proc to self.procs if all their requires
+ # are added to self.procs
+ # Then remove proc from nexts
+ # If there are still procs in nexts
+ # meaning some requires of those procs cannot run before
+ # those procs.
+ if not set(proc.requires) - set(self.procs): # type: ignore
+ self.procs.append(proc) # type: ignore
+ nexts.remove(proc)
+ nexts |= set(proc.nexts or ())
+ break
+ else:
+ if nexts:
+ raise ProcDependencyError(
+ f"No available next processes for {nexts}. "
+ "Did you forget to start with their "
+ "required processes?"
+ )
+
+ self.pbar = PipelinePBar(len(self.procs), self.name.upper())
+
+
+def run(
+ name: str,
+ starts: Type[Proc] | List[Type[Proc]],
+ data: Iterable = None,
+ *,
+ desc: str = None,
+ outdir: str | PathLike = None,
+ profile: str = "default",
+ **kwargs,
+) -> bool:
+ """Shortcut to run a pipeline
+
+ Args:
+ name: The name of the pipeline
+ starts: The start processes
+ data: The input data for the start processes
+ desc: The description of the pipeline
+ outdir: The output directory of the results
+ profile: The profile to use
+ **kwargs: Other options pass to Pipen to create the pipeline
+
+ Returns:
+ True if the pipeline ends successfully else False
+ """
+ pipeline = Pipen(
+ name=name,
+ desc=desc,
+ outdir=outdir,
+ **kwargs,
+ )
+ pipeline.set_starts(starts).set_data(data)
+ return pipeline.run(profile)
+
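A hedged usage sketch of the two equivalent ways to launch a pipeline with the API above; the start process is hypothetical.

from pipen import Pipen, Proc, run


class MyProc(Proc):
    """Hypothetical start process"""

    input = "x:var"
    script = "echo {{in.x}}"


# Either assemble the pipeline explicitly and chain the setters ...
Pipen(name="my-pipeline").set_starts(MyProc).set_data(["a", "b"]).run()

# ... or use the run() shortcut defined above (equivalent; pick one)
# run("my-pipeline", starts=MyProc, data=["a", "b"])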
"""Define hooks specifications and provide plugin manager"""
+from __future__ import annotations
+
+import shutil
+from os import PathLike
+from pathlib import Path
+from typing import Any, Dict, TYPE_CHECKING
+
+from simplug import Simplug, SimplugResult, makecall
+from xqute import JobStatus, Scheduler
+from xqute.utils import a_read_text, a_write_text, asyncify
+
+from .defaults import ProcOutputType
+from .exceptions import ProcInputValueError, ProcOutputValueError
+from .utils import get_mtime as _get_mtime
+
+
+if TYPE_CHECKING: # pragma: no cover
+ import signal
+ from simplug import SimplugImplCall
+ from xqute import Xqute
+ from .job import Job
+ from .proc import Proc
+ from .pipen import Pipen
+
+plugin = Simplug("pipen")
+
+
+@plugin.spec
+def on_setup(config: Dict[str, Any]) -> None:
+ """Setup for plugins, primarily used for the plugins to
+ setup some default configurations.
+
+ This is only called once for all pipelines.
+
+ Args:
+ config: The configuration dictionary
+ plugin options should be defined under "plugin_opts"
+ One should define a configuration item either with a prefix as
+ the identity for the plugin or a namespace inside the plugin config
+ """
+
+
+@plugin.spec
+async def on_init(pipen: Pipen) -> None:
+ """When the pipeline is initialized, and default configs are loaded
+
+ Args:
+ pipen: The Pipen object
+ """
+
+
+@plugin.spec
+async def on_start(pipen: Pipen) -> None:
+ """Right before the pipeline starts running.
+
+ Process relationships are inferred.
+
+ Args:
+ pipen: The Pipen object
+ """
+
+
+@plugin.spec
+async def on_complete(pipen: Pipen, succeeded: bool):
+    """When the pipeline is completed.
+
+ Args:
+ pipen: The Pipen object
+ succeeded: Whether the pipeline has successfully completed.
+ """
+
+
+@plugin.spec
+def on_proc_create(proc: Proc):
+    """Called in the Proc constructor when a process is created.
+
+ Enables plugins to modify the default attributes of processes
+
+ Args:
+ proc: The Proc object
+ """
+
+
+@plugin.spec
+async def on_proc_init(proc: Proc):
+ """Called when a process is initialized.
+
+ Allows plugins to modify the process attributes after initialization, but
+ before the jobs are initialized.
+
+ Args:
+ proc: The Proc object
+ """
+
+
+@plugin.spec
+def on_proc_input_computed(proc: Proc):
+ """Called after process input data is computed.
+
+ Args:
+ proc: The Proc object
+ """
+
+
+@plugin.spec
+def on_proc_script_computed(proc: Proc):
+ """Called after process script is computed.
+
+    The script is computed as a string that is about to be compiled into a
+ template.
+
+ Args:
+ proc: The Proc object
+ """
+
+
+@plugin.spec
+async def on_proc_start(proc: Proc):
+ """When a process is starting
+
+ Args:
+ proc: The process
+ """
+
+
+@plugin.spec(result=SimplugResult.TRY_ALL_FIRST_AVAIL)
+def on_proc_shutdown(proc: Proc, sig: signal.Signals) -> None:
+ """When pipeline is shutting down, by Ctrl-c for example.
+
+ Return False to stop shutting down, but you have to shut it down
+ by yourself, for example, `proc.xqute.task.cancel()`
+
+ Only the first return value will be used.
+
+ Args:
+        proc: The process
+ sig: The signal. `None` means a natural shutdown
+ """
+
+
+@plugin.spec
+async def on_proc_done(proc: Proc, succeeded: bool | str) -> None:
+ """When a process is done
+
+ Args:
+ proc: The process
+ succeeded: Whether the process succeeded or not. 'cached' if all jobs
+ are cached.
+ """
+
+
+@plugin.spec
+async def on_job_init(job: Job):
+ """When a job is initialized
+
+ Args:
+ job: The job
+ """
+
+
+@plugin.spec
+async def on_job_queued(job: Job):
+ """When a job is queued in xqute. Note it might not be queued yet in
+ the scheduler system.
+
+ Args:
+ job: The job
+ """
+
+
+@plugin.spec(result=SimplugResult.TRY_ALL_FIRST_AVAIL)
+async def on_job_submitting(job: Job) -> bool:
+ """When a job is submitting.
+
+    The first plugin (based on priority) that returns False from this hook
+    will cancel the submission.
+
+ Args:
+ job: The job
+
+ Returns:
+ False to cancel submission
+ """
+
+
+@plugin.spec
+async def on_job_submitted(job: Job):
+ """When a job is submitted in the scheduler system.
+
+ Args:
+ job: The job
+ """
+
+
+@plugin.spec
+async def on_job_started(job: Job):
+ """When a job starts to run in then scheduler system.
+
+ Note that the job might not be running yet in the scheduler system.
+
+ Args:
+ job: The job
+ """
+
+
+@plugin.spec
+async def on_job_polling(job: Job):
+ """When status of a job is being polled.
+
+ Args:
+ job: The job
+ """
+
+
+@plugin.spec(result=SimplugResult.TRY_ALL_FIRST_AVAIL)
+async def on_job_killing(job: Job) -> bool:
+ """When a job is being killed.
+
+    The first plugin (based on priority) that returns False from this hook
+    will cancel the killing.
+
+ Args:
+ job: The job
+
+ Returns:
+ False to cancel killing
+ """
+
+
+@plugin.spec
+async def on_job_killed(job: Job):
+ """When a job is killed
+
+ Args:
+ job: The job
+ """
+
+
+@plugin.spec
+async def on_job_succeeded(job: Job):
+ """When a job completes successfully.
+
+ Args:
+ job: The job
+ """
+
+
+@plugin.spec
+async def on_job_cached(job: Job):
+ """When a job is cached.
+
+ Args:
+ job: The job
+ """
+
+
+@plugin.spec
+async def on_job_failed(job: Job):
+ """When a job is done but failed.
+
+ Args:
+ job: The job
+ """
+
+
+def _collect_norm_inpath(calls: list[SimplugImplCall]) -> str:
+ for call in calls:
+ out = makecall(call)
+ if out is not None:
+ return str(out)
+
+ from .job import Job
+ # The first argument could be self in implementation
+ idx = 0 if isinstance(calls[0].args[0], Job) else 1
+ job = calls[0].kwargs.pop("job", calls[0].args[idx])
+ inpath = calls[0].kwargs.pop("inpath", calls[0].args[idx + 1])
+ raise ProcInputValueError(
+ f"[{job.proc.name}] Unsupported protocol for input path: "
+ f"{inpath.split('://')[0]}://"
+ )
+
+
+@plugin.spec(result=_collect_norm_inpath)
+def norm_inpath(job: Job, inpath: str | PathLike, is_dir: bool) -> str:
+ """Normalize the input path
+
+ Args:
+ job: The job
+ inpath: The input path
+ is_dir: Whether the path is a directory
+
+ Returns:
+ The normalized path
+ """
+
+
+def _collect_norm_outpath(calls: list[SimplugImplCall]) -> str:
+ for call in calls:
+ out = makecall(call)
+ if out is not None:
+ return str(out)
+
+ from .job import Job
+ # The first argument could be self in implementation
+ idx = 0 if isinstance(calls[0].args[0], Job) else 1
+ job = calls[0].kwargs.pop("job", calls[0].args[idx])
+ outpath = calls[0].kwargs.pop("outpath", calls[0].args[idx + 1])
+ raise ProcOutputValueError(
+ f"[{job.proc.name}] Unsupported protocol for output path: "
+ f"{outpath.split('://')[0]}://"
+ )
+
+
+@plugin.spec(result=_collect_norm_outpath)
+def norm_outpath(job: Job, outpath: str, is_dir: bool) -> str:
+ """Normalize the output path
+
+ Args:
+ job: The job
+ outpath: The output path
+ is_dir: Whether the path is a directory
+
+ Returns:
+ The normalized path
+ """
+
+
+def _collect_get_mtime(calls: list[SimplugImplCall]) -> float:
+ for call in calls:
+ out = makecall(call)
+ if out is not None:
+ return float(out)
+
+ from .job import Job
+ # The first argument could be self in implementation
+ idx = 0 if isinstance(calls[0].args[0], Job) else 1
+ job = calls[0].kwargs.pop("job", calls[0].args[idx])
+ path = calls[0].kwargs.pop("path", calls[0].args[idx + 1])
+ raise NotImplementedError(
+ f"[{job.proc.name}] Unsupported protocol in path to get mtime: "
+ f"{path.split('://')[0]}://"
+ )
+
+
+@plugin.spec(result=_collect_get_mtime)
+def get_mtime(job: Job, path: str | PathLike, dirsig: int) -> float:
+ """Get the mtime of a path, either a file or a directory
+
+ Args:
+ job: The job
+ path: The path to get mtime
+ dirsig: The depth of the directory to check the last modification time
+
+ Returns:
+ The last modification time
+ """
+
+
+async def _collect_clear_path(calls: list[SimplugImplCall]) -> bool:
+ for call in calls:
+ out = await makecall(call)
+ if out is not None:
+ return out
+
+ from .job import Job
+ # The first argument could be self in implementation
+ idx = 0 if isinstance(calls[0].args[0], Job) else 1
+ job = calls[0].kwargs.pop("job", calls[0].args[idx])
+ path = calls[0].kwargs.pop("path", calls[0].args[idx + 1])
+ raise NotImplementedError(
+ f"[{job.proc.name}] Unsupported protocol in path to clear: "
+ f"{path.split('://')[0]}://"
+ )
+
+
+@plugin.spec(result=_collect_clear_path)
+async def clear_path(job: Job, path: str | PathLike, is_dir: bool) -> bool:
+ """Clear the path, either a file or a directory
+
+ Args:
+ job: The job
+ path: The path to clear
+ is_dir: Whether the path is a directory
+
+ Returns:
+ Whether the path is cleared successfully
+ """
+
+
+async def _collect_output_exists(calls: list[SimplugImplCall]) -> bool:
+ for call in calls:
+ out = await makecall(call)
+ if out is not None:
+ return out
+
+ from .job import Job
+ # The first argument could be self in implementation
+ idx = 0 if isinstance(calls[0].args[0], Job) else 1
+ job = calls[0].kwargs.pop("job", calls[0].args[idx])
+ path = calls[0].kwargs.pop("path", calls[0].args[idx + 1])
+ raise NotImplementedError(
+ f"[{job.proc.name}] Unsupported protocol in path to test existence: "
+ f"{path.split('://')[0]}://"
+ )
+
+
+@plugin.spec(result=_collect_output_exists)
+async def output_exists(job: Job, path: str, is_dir: bool) -> bool:
+ """Check if the output exists
+
+ Args:
+ job: The job
+ path: The path to check
+ is_dir: Whether the path is a directory
+
+ Returns:
+ Whether the output exists
+ """
+
+
+@plugin.spec(result=SimplugResult.ALL_AVAILS)
+def on_jobcmd_init(job: Job) -> str:
+ """When the job command wrapper script is initialized before the prescript is run
+
+ This should return a piece of bash code to be inserted in the wrapped job
+ script (template), which is a python template string, with the following
+ variables available: `status` and `job`. `status` is the class `JobStatus` from
+ `xqute.defaults.py` and `job` is the `Job` instance.
+
+ For multiple plugins, the code will be inserted in the order of the plugin priority.
+
+ The code will replace the `#![jobcmd_init]` placeholder in the wrapped job script.
+ See also <https://github.com/pwwang/xqute/blob/master/xqute/defaults.py#L95>
+
+ Args:
+ job: The job object
+
+ Returns:
+ The bash code to be inserted
+ """
+
+
+@plugin.spec(result=SimplugResult.ALL_AVAILS)
+def on_jobcmd_prep(job: Job) -> str:
+ """When the job command right about to be run
+
+ This should return a piece of bash code to be inserted in the wrapped job
+ script (template), which is a python template string, with the following
+ variables available: `status` and `job`. `status` is the class `JobStatus` from
+ `xqute.defaults.py` and `job` is the `Job` instance.
+
+ The bash variable `$cmd` is accessible in the context. It is also possible to
+ modify the `cmd` variable. Just remember to assign the modified value to `cmd`.
+
+ For multiple plugins, the code will be inserted in the order of the plugin priority.
+ Keep in mind that the `$cmd` may be modified by other plugins.
+
+ The code will replace the `#![jobcmd_prep]` placeholder in the wrapped job script.
+ See also <https://github.com/pwwang/xqute/blob/master/xqute/defaults.py#L95>
+
+ Args:
+ job: The job object
+
+ Returns:
+ The bash code to be inserted
+ """
+
+
+@plugin.spec(result=SimplugResult.ALL_AVAILS)
+def on_jobcmd_end(job: Job) -> str:
+ """When the job command finishes and after the postscript is run
+
+ This should return a piece of bash code to be inserted in the wrapped job
+ script (template), which is a python template string, with the following
+ variables available: `status` and `job`. `status` is the class `JobStatus` from
+ `xqute.defaults.py` and `job` is the `Job` instance.
+
+ The bash variable `$rc` is accessible in the context, which is the return code
+ of the job command.
+
+ For multiple plugins, the code will be inserted in the order of the plugin priority.
+
+ The code will replace the `#![jobcmd_end]` placeholder in the wrapped job script.
+ See also <https://github.com/pwwang/xqute/blob/master/xqute/defaults.py#L95>
+
+ Args:
+ job: The job object
+
+ Returns:
+ The bash code to be inserted
+ """
+
+
+class PipenMainPlugin:
+ """The builtin core plugin, used to update the progress bar and
+ cache the job"""
+
+ name = "core"
+ # The priority is set to -1000 to make sure it is the first plugin
+ # to be called
+ priority = -1000
+
+ @plugin.impl
+ def on_proc_shutdown(self, proc: Proc, sig: signal.Signals):
+ """When a process is shutting down"""
+ if sig: # pragma: no cover
+ proc.log(
+ "warning",
+ "Got signal %r, trying a graceful shutdown ...",
+ sig.name,
+ )
+
+ @plugin.impl
+ async def on_job_submitted(self, job: Job):
+ """Update the progress bar when a job is submitted"""
+ job.proc.pbar.update_job_submitted()
+
+ @plugin.impl
+ async def on_job_started(self, job: Job):
+ """Update the progress bar when a job starts to run"""
+ job.proc.pbar.update_job_running()
+
+ @plugin.impl
+ async def on_job_cached(self, job: Job):
+ """Update the progress bar when a job is cached"""
+ job.proc.pbar.update_job_submitted()
+ job.proc.pbar.update_job_running()
+ job.proc.pbar.update_job_succeeded()
+ job.status = JobStatus.FINISHED
+
+ @plugin.impl
+ async def on_job_succeeded(self, job: Job):
+ """Cache the job and update the progress bar when a job is succeeded"""
+        # now the return code is 0; however, we still need to check whether
+        # the output files have been created, so that job.cache() does not fail
+ for outkey, outtype in job._output_types.items():
+ if outtype == ProcOutputType.VAR:
+ continue
+
+ output_exists = await plugin.hooks.output_exists(
+ job,
+ job.output[outkey],
+ outtype == ProcOutputType.DIR,
+ )
+ if not output_exists:
+ job.status = JobStatus.FAILED
+ job.proc.pbar.update_job_failed()
+ stderr = await a_read_text(job.stderr_file)
+ stderr = (
+ f"{stderr}\n\nOutput {outtype} {outkey!r} "
+ "is not generated."
+ )
+ await a_write_text(job.stderr_file, stderr)
+ break
+ else:
+ await job.cache()
+ job.proc.pbar.update_job_succeeded()
+
+ @plugin.impl
+ async def on_job_failed(self, job: Job):
+ """Update the progress bar when a job is failed"""
+ job.proc.pbar.update_job_failed()
+ if job.status == JobStatus.RETRYING:
+ job.log("debug", "Retrying #%s", job.trial_count + 1)
+ job.proc.pbar.update_job_retrying()
+
+ @plugin.impl
+ async def on_job_killed(self, job: Job):
+ """Update the status of a killed job"""
+ # instead of FINISHED to force the whole pipeline to quit
+ job.status = JobStatus.FAILED # pragma: no cover
+
+ @plugin.impl
+ def norm_inpath(
+ self,
+ job: Job,
+ inpath: str | PathLike,
+ is_dir: bool,
+ ) -> str:
+ """Normalize the input path"""
+ if "://" in str(inpath):
+ # Let the plugins handle the protocol
+ return None
+
+ return str(Path(inpath).expanduser().resolve())
+
+ @plugin.impl
+ def norm_outpath(
+ self,
+ job: Job,
+ outpath: str,
+ is_dir: bool,
+ ) -> str:
+ """Normalize the output path"""
+ if "://" in outpath:
+ # Let the plugins handle the protocol
+ return None
+
+ if Path(outpath).is_absolute():
+ raise ProcOutputValueError(
+ f"[{job.proc.name}] Process output should be a relative path: {outpath}"
+ )
+
+ out = job.outdir.resolve() / outpath
+ if is_dir:
+ out.mkdir(parents=True, exist_ok=True)
+
+ return str(out)
+
+ @plugin.impl
+ def get_mtime(
+ self,
+ job: Job,
+ path: str | PathLike,
+ dirsig: int,
+ ):
+ """Get the mtime of a path"""
+ if "://" in str(path):
+ # Let the plugins handle the protocol
+ return None
+
+ return _get_mtime(path, dirsig)
+
+ @plugin.impl
+ async def clear_path(self, job: Job, path: str | PathLike, is_dir: bool):
+ """Clear the path"""
+ if "://" in str(path):
+ # Let the plugins handle the protocol
+ return None
+
+ path = Path(path)
+ try:
+ # dead link
+ if not path.exists():
+ if path.is_symlink():
+ await asyncify(Path.unlink)(path)
+
+ elif not is_dir:
+ await asyncify(Path.unlink)(path)
+
+ else:
+ await asyncify(shutil.rmtree)(path)
+ path.mkdir()
+ except Exception: # pragma: no cover
+ return False
+ return True
+
+ @plugin.impl
+ async def output_exists(self, job: Job, path: str, is_dir: bool):
+ """Check if the output exists"""
+ if "://" in path:
+ # Let the plugins handle the protocol
+ return None
+
+ path = Path(path)
+ if not path.exists():
+ return False
+ if is_dir:
+ return len(list(path.iterdir())) > 0 # pragma: no cover
+ return True
+
+
+plugin.register(PipenMainPlugin)
+
+xqute_plugin = Simplug("xqute")
+
+
+class XqutePipenPlugin:
+ """The plugin for xqute working as proxy for pipen plugin hooks"""
+
+ name = "xqute.pipen"
+
+ @xqute_plugin.impl
+ def on_shutdown(self, xqute: Xqute, sig: signal.Signals):
+ """When a process is shutting down"""
+ return plugin.hooks.on_proc_shutdown(xqute.proc, sig)
+
+ @xqute_plugin.impl
+ async def on_job_init(self, scheduler: Scheduler, job: Job):
+ """When a job is initialized"""
+ await plugin.hooks.on_job_init(job)
+
+ @xqute_plugin.impl
+ async def on_job_queued(self, scheduler: Scheduler, job: Job):
+ """When a job is queued"""
+ await plugin.hooks.on_job_queued(job)
+
+ @xqute_plugin.impl
+ async def on_job_submitting(self, scheduler: Scheduler, job: Job):
+ """When a job is being submitted"""
+ return await plugin.hooks.on_job_submitting(job)
+
+ @xqute_plugin.impl
+ async def on_job_submitted(self, scheduler: Scheduler, job: Job):
+ """When a job is submitted"""
+ await plugin.hooks.on_job_submitted(job)
+
+ @xqute_plugin.impl
+ async def on_job_started(self, scheduler: Scheduler, job: Job):
+ """When a job starts to run"""
+ await plugin.hooks.on_job_started(job)
+
+ @xqute_plugin.impl
+ async def on_job_polling(self, scheduler: Scheduler, job: Job):
+ """When a job starts to run"""
+ await plugin.hooks.on_job_polling(job)
+
+ @xqute_plugin.impl
+ async def on_job_killing(self, scheduler: Scheduler, job: Job):
+ """When a job is being killed"""
+ return await plugin.hooks.on_job_killing(job) # pragma: no cover
+
+ @xqute_plugin.impl
+ async def on_job_killed(self, scheduler: Scheduler, job: Job):
+ """When a job is killed"""
+ await plugin.hooks.on_job_killed(job) # pragma: no cover
+
+ @xqute_plugin.impl
+ async def on_job_succeeded(self, scheduler: Scheduler, job: Job):
+ """When a job is succeeded"""
+ await plugin.hooks.on_job_succeeded(job)
+
+ @xqute_plugin.impl
+ async def on_job_failed(self, scheduler: Scheduler, job: Job):
+ """When a job is failed"""
+ await plugin.hooks.on_job_failed(job)
+
+ @xqute_plugin.impl
+ def on_jobcmd_init(self, scheduler: Scheduler, job: Job):
+ """When the job command wrapper script is initialized"""
+ codes = plugin.hooks.on_jobcmd_init(job)
+ if not codes:
+ return None
+ return "\n\n".join(codes)
+
+ @xqute_plugin.impl
+ def on_jobcmd_prep(self, scheduler: Scheduler, job: Job):
+ """When the job command is about to be run"""
+ codes = plugin.hooks.on_jobcmd_prep(job)
+ if not codes:
+ return None
+ return "\n\n".join(codes)
+
+ @xqute_plugin.impl
+ def on_jobcmd_end(self, scheduler: Scheduler, job: Job):
+ """When the job command finishes"""
+ codes = plugin.hooks.on_jobcmd_end(job)
+ if not codes:
+ return None
+ return "\n\n".join(codes)
+
+
+xqute_plugin.register(XqutePipenPlugin)
+
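A hedged sketch of a third-party plugin implementing a couple of the hook specs above; the plugin name and behavior are made up for illustration.

from pipen.pluginmgr import plugin
from pipen.utils import logger


class MyNotifyPlugin:
    """Hypothetical plugin that logs a few pipeline/job events"""

    name = "my-notify"
    version = "0.0.1"

    @plugin.impl
    async def on_complete(self, pipen, succeeded):
        logger.info("[my-notify] pipeline %s finished, succeeded=%s", pipen.name, succeeded)

    @plugin.impl
    async def on_job_failed(self, job):
        job.log("error", "job failed, see %s", job.stderr_file)


plugin.register(MyNotifyPlugin)
# Or expose the class via an entry point so plugin.load_entrypoints()
# picks it up at runtime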
"""Provides the process class: Proc"""
+from __future__ import annotations
+
+import asyncio
+import inspect
+import logging
+from abc import ABC, ABCMeta
+from functools import cached_property
+from os import PathLike
+from pathlib import Path
+from typing import (
+ Any,
+ Dict,
+ List,
+ Mapping,
+ Sequence,
+ Type,
+ TYPE_CHECKING,
+)
+
+from diot import Diot
+from rich import box
+from rich.panel import Panel
+from varname import VarnameException, varname
+from xqute import JobStatus, Xqute
+
+from .defaults import ProcInputType
+from .exceptions import (
+ ProcInputKeyError,
+ ProcInputTypeError,
+ ProcScriptFileNotFound,
+ PipenOrProcNameError,
+)
+from .pluginmgr import plugin
+from .scheduler import get_scheduler
+from .template import Template, get_template_engine
+from .utils import (
+ brief_list,
+ copy_dict,
+ desc_from_docstring,
+ get_logpanel_width,
+ ignore_firstline_dedent,
+ is_subclass,
+ is_valid_name,
+ log_rich_renderable,
+ logger,
+ make_df_colnames_unique_inplace,
+ strsplit,
+ update_dict,
+ get_shebang,
+ get_base,
+)
+
+if TYPE_CHECKING: # pragma: no cover
+ from .pipen import Pipen
+
+
+class ProcMeta(ABCMeta):
+ """Meta class for Proc"""
+
+ _INSTANCES: Dict[Type, Proc] = {}
+
+    def __repr__(cls) -> str:
+ """Representation for the Proc subclasses"""
+ return f"<Proc:{cls.name}>"
+
+ def __setattr__(cls, name: str, value: Any) -> None:
+ if name == "requires":
+ value = cls._compute_requires(value)
+ return super().__setattr__(name, value)
+
+    def __call__(cls, *args: Any, **kwds: Any) -> Proc:
+ """Make sure Proc subclasses are singletons
+
+ Args:
+ *args: and
+ **kwds: Arguments for the constructor
+
+ Returns:
+ The Proc instance
+ """
+ if cls not in cls._INSTANCES:
+ cls._INSTANCES[cls] = super().__call__(*args, **kwds)
+
+ return cls._INSTANCES[cls]
+
+
+class Proc(ABC, metaclass=ProcMeta):
+
+ """The abstract class for processes.
+
+    It's an abstract class. You can't instantiate a process using it directly.
+    You have to subclass it. The subclass itself can be used as a process
+    directly.
+
+    Each subclass is a singleton, so to instantiate a new process, subclass
+    an existing `Proc` subclass, or use `Proc.from_proc()`.
+
+    Never use the constructor directly. The Proc is designed
+    as a singleton class, and is instantiated internally.
+
+ Attributes:
+ name: The name of the process. Will use the class name by default.
+ desc: The description of the process. Will use the summary from
+ the docstring by default.
+ envs: The arguments that are job-independent, useful for common options
+ across jobs.
+ envs_depth: How deep to update the envs when subclassed.
+ cache: Should we detect whether the jobs are cached?
+ dirsig: When checking the signature for caching, whether should we walk
+ through the content of the directory? This is sometimes
+ time-consuming if the directory is big.
+ export: When True, the results will be exported to `<pipeline.outdir>`
+ Defaults to None, meaning only end processes will export.
+ You can set it to True/False to enable or disable exporting
+ for processes
+ error_strategy: How to deal with the errors
+ - retry, ignore, halt
+ - halt to halt the whole pipeline, no submitting new jobs
+ - terminate to just terminate the job itself
+        num_retries: How many times to retry the jobs once an error occurs
+ template: Define the template engine to use.
+ This could be either a template engine or a dict with key `engine`
+ indicating the template engine and the rest the arguments passed
+ to the constructor of the `pipen.template.Template` object.
+ The template engine could be either the name of the engine,
+ currently jinja2 and liquidpy are supported, or a subclass of
+ `pipen.template.Template`.
+ You can subclass `pipen.template.Template` to use your own template
+ engine.
+ forks: How many jobs to run simultaneously?
+ input: The keys for the input channel
+ input_data: The input data (will be computed for dependent processes)
+ lang: The language for the script to run. Should be the path to the
+ interpreter if `lang` is not in `$PATH`.
+ order: The execution order for this process. The bigger the number
+ is, the later the process will be executed. Default: 0.
+ Note that the dependent processes will always be executed first.
+ This doesn't work for start processes either, whose orders are
+ determined by `Pipen.set_starts()`
+ output: The output keys for the output channel
+ (the data will be computed)
+ plugin_opts: Options for process-level plugins
+ requires: The dependency processes
+ scheduler: The scheduler to run the jobs
+ scheduler_opts: The options for the scheduler
+ script: The script template for the process
+        submission_batch: How many jobs to be submitted simultaneously
+
+ nexts: Computed from `requires` to build the process relationships
+ output_data: The output data (to pass to the next processes)
+ """
+
+ name: str = None
+ desc: str = None
+ envs: Mapping[str, Any] = None
+ envs_depth: int = None
+ cache: bool = None
+ dirsig: bool = None
+ export: bool = None
+ error_strategy: str = None
+ num_retries: int = None
+ template: str | Type[Template] = None
+ template_opts: Mapping[str, Any] = None
+ forks: int = None
+ input: str | Sequence[str] = None
+ input_data: Any = None
+ lang: str = None
+ order: int = None
+ output: str | Sequence[str] = None
+ plugin_opts: Mapping[str, Any] = None
+ requires: Type[Proc] | Sequence[Type[Proc]] = None
+ scheduler: str = None
+ scheduler_opts: Mapping[str, Any] = None
+ script: str = None
+ submission_batch: int = None
+
+ nexts: Sequence[Type[Proc]] = None
+ output_data: Any = None
+ workdir: PathLike = None
+ # metadata that marks the process
+ # Can also be used for plugins
+    # It's not inherited
+ __meta__: Mapping[str, Any] = None
+
+    @classmethod
+ def from_proc(
+ cls,
+ proc: Type[Proc],
+ name: str = None,
+ desc: str = None,
+ envs: Mapping[str, Any] = None,
+ envs_depth: int = None,
+ cache: bool = None,
+ export: bool = None,
+ error_strategy: str = None,
+ num_retries: int = None,
+ forks: int = None,
+ input_data: Any = None,
+ order: int = None,
+ plugin_opts: Mapping[str, Any] = None,
+ requires: Sequence[Type[Proc]] = None,
+ scheduler: str = None,
+ scheduler_opts: Mapping[str, Any] = None,
+ submission_batch: int = None,
+ ) -> Type[Proc]:
+ """Create a subclass of Proc using another Proc subclass or Proc itself
+
+ Args:
+ proc: The Proc subclass
+ name: The new name of the process
+ desc: The new description of the process
+ envs: The arguments of the process, will overwrite parent one
+ The items that are specified will be inherited
+ envs_depth: How deep to update the envs when subclassed.
+ cache: Whether we should check the cache for the jobs
+ export: When True, the results will be exported to
+ `<pipeline.outdir>`
+ Defaults to None, meaning only end processes will export.
+ You can set it to True/False to enable or disable exporting
+ for processes
+ error_strategy: How to deal with the errors
+ - retry, ignore, halt
+ - halt to halt the whole pipeline, no submitting new jobs
+ - terminate to just terminate the job itself
+            num_retries: How many times to retry the jobs once an error occurs
+ forks: New forks for the new process
+ input_data: The input data for the process. Only when this process
+ is a start process
+ order: The order to execute the new process
+ plugin_opts: The new plugin options, unspecified items will be
+ inherited.
+ requires: The required processes for the new process
+            scheduler: The new scheduler to run the new process
+ scheduler_opts: The new scheduler options, unspecified items will
+ be inherited.
+            submission_batch: How many jobs to be submitted simultaneously
+
+ Returns:
+ The new process class
+ """
+ if not name:
+ try:
+ name = varname() # type: ignore
+ except VarnameException as vexc: # pragma: no cover
+ raise ValueError(
+ "Process name cannot be detected from assignment, "
+ "pass one explicitly to `Proc.from_proc(..., name=...)`"
+ ) from vexc
+
+ kwargs: Dict[str, Any] = {
+ "name": name,
+ "export": export,
+ "input_data": input_data,
+ "requires": requires,
+ "nexts": None,
+ "output_data": None,
+ }
+
+ locs = locals()
+ for key in (
+ "desc",
+ "envs",
+ "envs_depth",
+ "cache",
+ "forks",
+ "order",
+ "plugin_opts",
+ "scheduler",
+ "scheduler_opts",
+ "error_strategy",
+ "num_retries",
+ "submission_batch",
+ ):
+ if locs[key] is not None:
+ kwargs[key] = locs[key]
+
+ kwargs["__doc__"] = proc.__doc__
+ out = type(name, (proc,), kwargs)
+ return out
+
+    def __init_subclass__(cls) -> None:
+ """Do the requirements inferring since we need them to build up the
+ process relationship
+ """
+ base = [
+ mro
+ for mro in cls.__mro__
+ if issubclass(mro, Proc) and mro is not Proc and mro is not cls
+ ]
+ parent = base[0] if base else None
+ # cls.requires = cls._compute_requires()
+ # triggers cls.__setattr__() to compute requires
+ cls.nexts = []
+ cls.requires = cls.requires
+
+ if cls.name is None or (parent and cls.name == parent.name):
+ cls.name = cls.__name__
+
+ if not is_valid_name(cls.name):
+ raise PipenOrProcNameError(
+ f"{cls.name} is not a valid process name, expecting "
+ r"'^[\w.-]+$'"
+ )
+
+ envs = update_dict(
+ parent.envs if parent else None,
+ cls.envs,
+ depth=0 if not parent or parent.envs_depth is None else parent.envs_depth,
+ )
+ # So values can be accessed like Proc.envs.a.b
+ cls.envs = envs if isinstance(envs, Diot) else Diot(envs or {})
+ cls.plugin_opts = update_dict(
+ parent.plugin_opts if parent else None,
+ cls.plugin_opts,
+ )
+ cls.scheduler_opts = update_dict(
+ parent.scheduler_opts if parent else {},
+ cls.scheduler_opts,
+ )
+ cls.__meta__ = {"procgroup": None}
+
+ def __init__(self, pipeline: Pipen = None) -> None:
+ """Constructor
+
+ This is called only at runtime.
+
+ Args:
+ pipeline: The Pipen object
+ """
+ # instance properties
+ self.pipeline = pipeline
+
+ self.pbar = None
+ self.jobs: List[Any] = []
+ self.xqute = None
+ self.__class__.workdir = Path(self.pipeline.workdir) / self.name
+ # plugins can modify some default attributes
+ plugin.hooks.on_proc_create(self)
+
+ # Compute the properties
+ # otherwise, the property can be accessed directly from class vars
+ if self.desc is None:
+ self.desc: str = desc_from_docstring(self.__class__, Proc)
+
+ if self.export is None:
+ self.export = bool(not self.nexts)
+
+ # log the basic information
+ self._log_info()
+
+ # template
+ self.template = get_template_engine(
+ self.template or self.pipeline.config.template
+ )
+ template_opts = copy_dict(self.pipeline.config.template_opts)
+ template_opts.update(self.template_opts or {})
+ self.template_opts = template_opts
+
+ plugin_opts = copy_dict(self.pipeline.config.plugin_opts)
+ plugin_opts.update(self.plugin_opts or {})
+ self.plugin_opts = plugin_opts
+
+ # input
+ self.input = self._compute_input() # type: ignore
+ plugin.hooks.on_proc_input_computed(self)
+ # output
+ self.output = self._compute_output()
+ # scheduler
+ self.scheduler = get_scheduler( # type: ignore
+ self.scheduler or self.pipeline.config.scheduler
+ )
+ # script
+ self.script = self._compute_script() # type: ignore
+ self.workdir.mkdir(exist_ok=True)
+
+ if self.submission_batch is None:
+ self.submission_batch = self.pipeline.config.submission_batch
+
+    async def init(self) -> None:
+ """Init all other properties and jobs"""
+ import pandas
+
+ scheduler_opts = (
+ copy_dict(self.pipeline.config.scheduler_opts, 2) or {}
+ )
+ scheduler_opts.update(self.scheduler_opts or {})
+ self.xqute = Xqute(
+ self.scheduler,
+ job_metadir=self.workdir,
+ job_submission_batch=self.submission_batch,
+ job_error_strategy=self.error_strategy
+ or self.pipeline.config.error_strategy,
+ job_num_retries=self.pipeline.config.num_retries
+ if self.num_retries is None
+ else self.num_retries,
+ scheduler_forks=self.forks or self.pipeline.config.forks,
+ scheduler_jobprefix=self.name,
+ **scheduler_opts,
+ )
+ # for the plugin hooks to access
+ self.xqute.proc = self
+
+ await plugin.hooks.on_proc_init(self)
+ await self._init_jobs()
+ self.__class__.output_data = pandas.DataFrame(
+ (job.output for job in self.jobs)
+ )
+
+    def gc(self):
+ """GC process for the process to save memory after it's done"""
+ del self.xqute
+ self.xqute = None
+
+ del self.jobs[:]
+ self.jobs = []
+
+ del self.pbar
+ self.pbar = None
+
+    def log(
+ self,
+ level: int | str,
+ msg: str,
+ *args,
+ logger: logging.LoggerAdapter = logger,
+ ) -> None:
+ """Log message for the process
+
+ Args:
+ level: The log level of the record
+ msg: The message to log
+ *args: The arguments to format the message
+ logger: The logging logger
+ """
+ msg = msg % args
+ if not isinstance(level, int):
+ level = logging.getLevelName(level.upper())
+ logger.log(
+ level, # type: ignore
+ "[cyan]%s:[/cyan] %s",
+ self.name,
+ msg,
+ )
+
+    async def run(self) -> None:
+ """Run the process"""
+ # init pbar
+ self.pbar = self.pipeline.pbar.proc_bar(self.size, self.name)
+
+ await plugin.hooks.on_proc_start(self)
+
+ cached_jobs = []
+ for job in self.jobs:
+ if await job.cached:
+ cached_jobs.append(job.index)
+ await plugin.hooks.on_job_cached(job)
+ else:
+ await self.xqute.put(job)
+ if cached_jobs:
+ self.log("info", "Cached jobs: [%s]", brief_list(cached_jobs))
+ await self.xqute.run_until_complete()
+ self.pbar.done()
+ await plugin.hooks.on_proc_done(
+ self,
+ False
+ if not self.succeeded
+ else "cached"
+ if len(cached_jobs) == self.size
+ else True,
+ )
+
+ # properties
+ @cached_property
+ def size(self) -> int:
+ """The size of the process (# of jobs)"""
+ return len(self.jobs)
+
+ @cached_property
+ def succeeded(self) -> bool:
+ """Check if the process is succeeded (all jobs succeeded)"""
+ return all(job.status == JobStatus.FINISHED for job in self.jobs)
+
+ # Private methods
+ @classmethod
+ def _compute_requires(
+ cls,
+ requires: Type[Proc] | Sequence[Type[Proc]] = None,
+ ) -> Sequence[Type[Proc]]:
+ """Compute the required processes and fill the nexts
+
+ Args:
+ requires: The required processes. If None, will use `cls.requires`
+
+ Returns:
+ None or sequence of Proc subclasses
+ """
+ if requires is None:
+ requires = cls.requires
+
+ if requires is None:
+ return requires
+
+ if is_subclass(requires, Proc):
+ requires = [requires] # type: ignore
+
+ # if req is in cls.__bases__, then cls.nexts will be affected by
+ # req.nexts
+ my_nexts = None if cls.nexts is None else cls.nexts[:]
+ for req in requires: # type: ignore
+ if not req.nexts:
+ req.nexts = [cls]
+ else:
+ req.nexts.append(cls) # type: ignore
+ cls.nexts = my_nexts
+
+ return requires # type: ignore
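+
+    # A minimal sketch of the wiring above (hypothetical processes, not part
+    # of the class):
+    #   class P1(Proc): ...
+    #   class P2(Proc): requires = P1
+    # Assigning `requires` on the class goes through the class-level
+    # `__setattr__`, which calls `_compute_requires()`, so afterwards
+    # P2.requires == [P1] and P1.nexts == [P2].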
+
+ async def _init_job(self, worker_id: int) -> None:
+ """A worker to initialize jobs
+
+ Args:
+ worker_id: The worker id
+ """
+ for job in self.jobs:
+ if job.index % self.submission_batch != worker_id:
+ continue
+ await job.prepare(self)
+
+ async def _init_jobs(self) -> None:
+ """Initialize all jobs
+
+ Args:
+ config: The pipeline configuration
+ """
+
+ for i in range(self.input.data.shape[0]):
+ job = self.scheduler.job_class(i, "", self.workdir)
+ self.jobs.append(job)
+
+ await asyncio.gather(
+ *(self._init_job(i) for i in range(self.submission_batch))
+ )
+
+ def _compute_input(self) -> Mapping[str, Mapping[str, Any]]:
+ """Calculate the input based on input and input data
+
+ Returns:
+ A dict with type and data
+ """
+ import pandas
+ from .channel import Channel
+
+ # split input keys into keys and types
+ input_keys = self.input
+ if input_keys and isinstance(input_keys, str):
+ input_keys = strsplit(input_keys, ",")
+
+ if not input_keys:
+ raise ProcInputKeyError(f"[{self.name}] No input provided")
+
+ out = Diot(type={}, data=None)
+ for input_key_type in input_keys:
+ if ":" not in input_key_type:
+ out.type[input_key_type] = ProcInputType.VAR
+ continue
+
+ input_key, input_type = strsplit(input_key_type, ":", 1)
+ if input_type not in ProcInputType.__dict__.values():
+ raise ProcInputTypeError(
+ f"[{self.name}] Unsupported input type: {input_type}"
+ )
+ out.type[input_key] = input_type
+
+ # get the data
+ if not self.requires and self.input_data is None:
+ out.data = pandas.DataFrame([[None] * len(out.type)])
+ elif not self.requires:
+ out.data = Channel.create(self.input_data)
+ elif callable(self.input_data):
+ out.data = Channel.create(
+ self.__class__.input_data(
+ *(req.output_data for req in self.requires) # type: ignore
+ )
+ )
+ else:
+ if self.input_data:
+ self.log(
+ "warning",
+ "Ignoring input data, this is not a start process.",
+ )
+
+ out.data = pandas.concat(
+ (req.output_data for req in self.requires), # type: ignore
+ axis=1,
+ ).ffill()
+
+ make_df_colnames_unique_inplace(out.data)
+
+ # try match the column names
+ # if none matched, use the first columns
+ # rest_cols = out.data.columns.difference(out.type, False)
+ rest_cols = [col for col in out.data.columns if col not in out.type]
+ len_rest_cols = len(rest_cols)
+ # matched_cols = out.data.columns.intersection(out.type)
+ matched_cols = [col for col in out.data.columns if col in out.type]
+ needed_cols = [col for col in out.type if col not in matched_cols]
+ len_needed_cols = len(needed_cols)
+
+ if len_rest_cols > len_needed_cols:
+ self.log(
+ "warning",
+ "Wasted %s column(s) of input data.",
+ len_rest_cols - len_needed_cols,
+ )
+ elif len_rest_cols < len_needed_cols:
+ self.log(
+ "warning",
+ "No data column for input: %s, using None.",
+ needed_cols[len_rest_cols:],
+ )
+ # Add None
+ # Use loop to keep order
+ for needed_col in needed_cols[len_rest_cols:]:
+ out.data.insert(out.data.shape[1], needed_col, None)
+ len_needed_cols = len_rest_cols
+
+ out.data = out.data.rename(
+ columns=dict(zip(rest_cols[:len_needed_cols], needed_cols))
+ ).loc[:, list(out.type)]
+
+ return out
+
+ def _compute_output(self) -> str | List[str]:
+ """Compute the output for jobs to render"""
+ if not self.output:
+ return None
+
+ if isinstance(self.output, (list, tuple)):
+ return [
+ self.template(oput, **self.template_opts) # type: ignore
+ for oput in self.output
+ ]
+
+ return self.template(self.output, **self.template_opts) # type: ignore
+
+ def _compute_script(self) -> Template:
+ """Compute the script for jobs to render"""
+ if not self.script:
+ self.log("warning", "No script specified.")
+ return None
+
+ script = self.script
+ if script.startswith("file://"):
+ script_file = Path(script[7:])
+ if not script_file.is_absolute():
+ base = get_base(
+ self.__class__,
+ Proc,
+ script,
+ lambda klass: getattr(klass, "script", None),
+ )
+ script_file = Path(inspect.getfile(base)).parent / script_file
+ if not script_file.is_file():
+ raise ProcScriptFileNotFound(
+ f"No such script file: {script_file}"
+ )
+ script = script_file.read_text()
+
+ self.script = ignore_firstline_dedent(script)
+ if not self.lang:
+ self.lang = get_shebang(self.script)
+
+ plugin.hooks.on_proc_script_computed(self)
+ return self.template(self.script, **self.template_opts) # type: ignore
+
+ def _log_info(self):
+ """Log some basic information of the process"""
+ title = (
+ f"{self.__meta__['procgroup'].name}/{self.name}"
+ if self.__meta__["procgroup"]
+ else self.name
+ )
+ panel = Panel(
+ self.desc or "Undescribed",
+ title=title,
+ box=box.Box(
+ "╭═┬╮\n"
+ "║ ║║\n"
+ "├═┼┤\n"
+ "║ ║║\n"
+ "├═┼┤\n"
+ "├═┼┤\n"
+ "║ ║║\n"
+ "╰═┴╯\n"
+ )
+ if self.export
+ else box.ROUNDED,
+ width=get_logpanel_width(),
+ )
+
+ logger.info("")
+ log_rich_renderable(panel, "cyan", logger.info)
+ self.log("info", "Workdir: %r", str(self.workdir))
+ self.log(
+ "info",
+ "[yellow]<<<[/yellow] %s",
+ [proc.name for proc in self.requires]
+ if self.requires
+ else "[START]",
+ )
+ self.log(
+ "info",
+ "[yellow]>>>[/yellow] %s",
+ [proc.name for proc in self.nexts] if self.nexts else "[END]",
+ )
+
"""Process group that contains a set of processes.
+
+It can be easily used to create a pipeline that runs independently or
+integrated into a larger pipeline.
+
+Runs directly:
+>>> proc_group = ProcGroup(<options>)
+>>> proc_group.as_pipen(<pipeline options>).set_data(<data>).run()
+
+Integrated into a larger pipeline:
+>>> proc_group = ProcGroup(<options>)
+>>> # proc could be a process within the larger pipeline
+>>> proc.requires = proc_group.<proc>
+
+To add a process to the proc group, use the `add_proc` method:
+>>> class MyProcGroup(ProcGroup):
+>>> ...
+>>>
+>>> proc_group = MyProcGroup(...)
+>>> @proc_group.add_proc
+>>> class MyProc(Proc):
+>>> ...
+
+Or define a process within the group class, created when the group is instantiated:
+>>> class MyProcGroup(ProcGroup):
+>>> ...
+>>>
+>>> @ProcGroup.add_proc
+>>> def my_proc(self):
+>>> class MyProc(Proc):
+>>>         # You may use self.opts here
+>>> ...
+>>> return MyProc
+>>> proc_group = MyProcGroup(...)
+"""
+from __future__ import annotations
+
+from os import PathLike
+from functools import wraps, cached_property
+from typing import Any, Callable, Mapping, Type, List
+from abc import ABC, ABCMeta
+from diot import Diot
+
+from .pipen import Pipen
+from .proc import Proc
+
+
+class ProcGroupMeta(ABCMeta):
+ """Meta class for ProcGroup"""
+
+ _INST = None
+
+    def __call__(cls, *args, **kwds):
+ """Make sure Proc subclasses are singletons
+
+ Args:
+ *args: and
+ **kwds: Arguments for the constructor
+
+ Returns:
+ The Proc instance
+ """
+ if cls._INST is None:
+ cls._INST = super().__call__(*args, **kwds)
+
+ return cls._INST
+
+
+class ProcGroup(ABC, metaclass=ProcGroupMeta):
+ """A group of processes that can be run independently or
+ integrated into a larger pipeline.
+ """
+
+ name: str | None = None
+ __meta__: Mapping[str, Any] = {}
+ DEFAULTS = Diot()
+ PRESERVED = {
+ "opts",
+ "name",
+ "add_proc",
+ "as_pipen",
+ "procs",
+ "starts",
+ "DEFAULTS",
+ "PRESERVED",
+ "_INST",
+ }
+
+    def __init_subclass__(cls) -> None:
+ # Clear the meta
+ cls.__meta__ = {}
+
+ def __init__(self, **opts) -> None:
+ self.opts = Diot(self.__class__.DEFAULTS or {}) | (opts or {})
+ self.name = self.__class__.name or self.__class__.__name__
+ self.starts: List[Type[Proc]] = []
+ self.procs = Diot()
+
+ self._load_runtime_procs()
+
+ def _load_runtime_procs(self):
+ """Load all processes that are added at runtime"""
+ # Load all processes if they are decorated by ProcGroup.add_proc
+ for name, attr in self.__class__.__dict__.items():
+ if isinstance(attr, cached_property):
+ getattr(self, name)
+ elif isinstance(attr, type) and issubclass(attr, Proc):
+ self.add_proc(attr)
+
+    def add_proc(
+ self_or_method: ProcGroup | Callable[[ProcGroup], Type[Proc]],
+ proc: Type[Proc] | None = None,
+ ) -> Type[Proc] | cached_property:
+ """Add a process to the proc group
+
+ It works either as a decorator to the process directly or as a
+ decorator to a method that returns the process.
+
+ Args:
+ self_or_method: The proc group instance or a method that
+ returns the process
+ proc: The process class if `self_or_method` is the proc group
+
+ Returns:
+ The process class if `self_or_method` is the proc group, or
+ a cached property that returns the process class
+ """
+ if isinstance(self_or_method, ProcGroup):
+ # Called as self.add_proc or pg.add_proc
+ if proc is None:
+ return self_or_method.add_proc # type: ignore
+
+ if proc.name in self_or_method.__class__.PRESERVED:
+ raise ValueError(
+ f"Process name `{proc.name}` is reserved for ProcGroup"
+ )
+
+ setattr(self_or_method, proc.name, proc)
+ proc.__meta__["procgroup"] = self_or_method # type: ignore
+ if not proc.requires:
+ self_or_method.starts.append(proc)
+ self_or_method.procs[proc.name] = proc
+ return proc
+
+ @wraps(self_or_method)
+ def wrapper(self):
+ proc = self_or_method(self)
+
+ if proc is None:
+ return None
+
+ if (not isinstance(proc, type) or not issubclass(proc, Proc)):
+ raise ValueError(f"`{proc}` is not a Proc subclass")
+
+ proc.__meta__["procgroup"] = self
+ if not proc.requires:
+ self.starts.append(proc)
+ self.procs[proc.name] = proc
+ return proc
+
+ return cached_property(wrapper)
+
+    def as_pipen(
+ self,
+ name: str | None = None,
+ desc: str | None = None,
+ outdir: str | PathLike | None = None,
+ **kwargs,
+ ) -> Pipen:
+ """Convert the pipeline to a Pipen instance
+
+ Args:
+ name: The name of the pipeline
+ desc: The description of the pipeline
+ outdir: The output directory of the pipeline
+ **kwargs: The keyword arguments to pass to Pipen
+
+ Returns:
+ The Pipen instance
+ """
+ name = name or self.__class__.__name__
+ if self.__doc__:
+ desc = desc or self.__doc__.lstrip().splitlines()[0]
+
+ pipe = Pipen(name=name, desc=desc, outdir=outdir, **kwargs)
+ pipe.set_start(self.starts)
+ return pipe
+
"""Provide the PipelinePBar and ProcPBar classes"""
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .utils import truncate_text
+
+if TYPE_CHECKING: # pragma: no cover
+ import enlighten
+
+# [12/02/20 12:44:06] I core
+# pipeline: 100%|
+# | desc_len |
+PBAR_DESC_LEN = 24
+
+
+class ProcPBar:
+ """The progress bar for processes"""
+
+ def __init__(
+ self, manager: enlighten.Manager, proc_size: int, proc_name: str
+ ) -> None:
+ self.submitted_counter = manager.counter(
+ total=proc_size,
+ color="cyan",
+ desc=proc_name,
+ unit="jobs",
+ leave=False,
+ )
+ self.running_counter = self.submitted_counter.add_subcounter("yellow")
+ self.success_counter = self.submitted_counter.add_subcounter("green")
+ self.failure_counter = self.submitted_counter.add_subcounter("red")
+
+    def update_job_submitted(self):
+ """Update the progress bar when a job is submitted"""
+ self.submitted_counter.update()
+
+    def update_job_retrying(self):
+ """Update the progress bar when a job is retrying"""
+ # self.running_counter.count -= 1
+ self.failure_counter.update(-1)
+
+    def update_job_running(self):
+ """Update the progress bar when a job is running"""
+ try:
+ self.running_counter.update_from(self.submitted_counter)
+ except ValueError: # pragma: no cover
+ pass
+
+    def update_job_succeeded(self):
+ """Update the progress bar when a job is succeeded"""
+ try:
+ self.success_counter.update_from(self.running_counter)
+ except ValueError: # pragma: no cover
+ try:
+ self.success_counter.update_from(self.submitted_counter)
+ except ValueError: # pragma: no cover
+ pass
+ except: # noqa: E722 # pragma: no cover
+ pass
+
+    def update_job_failed(self):
+ """Update the progress bar when a job is failed"""
+ try:
+ self.failure_counter.update_from(self.running_counter)
+ except ValueError: # pragma: no cover
+ try:
+ self.failure_counter.update_from(self.submitted_counter)
+ except ValueError: # pragma: no cover
+ pass
+ except: # noqa: E722 # pragma: no cover
+ pass
+
+    def done(self):
+ """The process is done"""
+ self.submitted_counter.close()
+
+
+class PipelinePBar:
+ """Progress bar for the pipeline"""
+
+ def __init__(self, n_procs: int, ppln_name: str) -> None:
+ """Initialize progress bar for pipeline"""
+ import enlighten
+
+ desc_len = PBAR_DESC_LEN
+ ppln_name = truncate_text(ppln_name, desc_len)
+ self.manager = enlighten.get_manager()
+ self.running_counter = self.manager.counter(
+ total=n_procs,
+ color="yellow",
+ desc=f"{ppln_name:>{desc_len}}:",
+ unit="procs",
+ )
+ self.success_counter = self.running_counter.add_subcounter("green")
+ self.failure_counter = self.running_counter.add_subcounter("red")
+ self.desc_len = desc_len
+
+    def proc_bar(self, proc_size: int, proc_name: str) -> ProcPBar:
+ """Get the progress bar for a process
+
+ Args:
+ proc_size: The size of the process
+ proc_name: The name of the process
+
+ Returns:
+ The progress bar for the given process
+ """
+ proc_name = truncate_text(proc_name, self.desc_len)
+ proc_name = f"{proc_name:>{self.desc_len}}:"
+ return ProcPBar(self.manager, proc_size, proc_name)
+
+    def update_proc_running(self):
+ """Update the progress bar when a process is running"""
+ self.running_counter.update()
+
+    def update_proc_done(self):
+ """Update the progress bar when a process is done"""
+ self.success_counter.update_from(self.running_counter)
+
+    def update_proc_error(self):
+ """Update the progress bar when a process is errored"""
+ self.failure_counter.update_from(self.running_counter)
+
+    def done(self) -> None:
+ """When the pipeline is done"""
+ self.running_counter.close()
+ self.manager.stop()
+
"""Provide builting schedulers"""
+from __future__ import annotations
+
+from typing import Type
+
+from xqute import Scheduler
+from xqute.schedulers.local_scheduler import (
+ LocalJob as XquteLocalJob,
+ LocalScheduler as XquteLocalScheduler,
+)
+from xqute.schedulers.sge_scheduler import (
+ SgeJob as XquteSgeJob,
+ SgeScheduler as XquteSgeScheduler
+)
+from xqute.schedulers.slurm_scheduler import (
+ SlurmJob as XquteSlurmJob,
+ SlurmScheduler as XquteSlurmScheduler,
+)
+from xqute.schedulers.ssh_scheduler import (
+ SshJob as XquteSshJob,
+ SshScheduler as XquteSshScheduler,
+)
+
+from .defaults import SCHEDULER_ENTRY_GROUP
+from .exceptions import NoSuchSchedulerError, WrongSchedulerTypeError
+from .job import Job
+from .utils import is_subclass, load_entrypoints
+
+
+class LocalJob(XquteLocalJob, Job):
+ """Job class for local scheduler"""
+
+
+class LocalScheduler(XquteLocalScheduler):
+ """Local scheduler"""
+ job_class = LocalJob
+
+
+class SgeJob(XquteSgeJob, Job):
+ """Job class for SGE scheduler"""
+
+
+class SgeScheduler(XquteSgeScheduler):
+ """SGE scheduler"""
+ job_class = SgeJob
+
+
+class SlurmJob(XquteSlurmJob, Job):
+ """Job class for Slurm scheduler"""
+
+
+class SlurmScheduler(XquteSlurmScheduler):
+ """Slurm scheduler"""
+ job_class = SlurmJob
+
+
+class SshJob(XquteSshJob, Job):
+ """Job class for SSH scheduler"""
+
+
+class SshScheduler(XquteSshScheduler):
+ """SSH scheduler"""
+ job_class = SshJob
+
+
+def get_scheduler(scheduler: str | Type[Scheduler]) -> Type[Scheduler]:
+    """Get the scheduler by name or by the scheduler class itself
+
+ Args:
+ scheduler: The scheduler class or name
+
+ Returns:
+ The scheduler class
+ """
+ if is_subclass(scheduler, Scheduler):
+ return scheduler # type: ignore
+
+ if scheduler == "local":
+ return LocalScheduler
+
+ if scheduler == "sge":
+ return SgeScheduler
+
+ if scheduler == "slurm":
+ return SlurmScheduler
+
+ if scheduler == "ssh":
+ return SshScheduler
+
+ for n, obj in load_entrypoints(SCHEDULER_ENTRY_GROUP): # pragma: no cover
+ if n == scheduler:
+ if not is_subclass(obj, Scheduler):
+ raise WrongSchedulerTypeError(
+ "Scheduler should be a subclass of "
+ "pipen.scheduler.Scheduler."
+ )
+ return obj
+
+ raise NoSuchSchedulerError(str(scheduler))
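+
+# Illustrative usage sketch (only the built-in names shown above are assumed):
+#   get_scheduler("local")        # -> LocalScheduler
+#   get_scheduler("slurm")        # -> SlurmScheduler
+#   get_scheduler(SgeScheduler)   # -> SgeScheduler (classes pass through)
+# Unknown names are looked up in the scheduler entry-point group and,
+# failing that, raise NoSuchSchedulerError.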
+
"""Template adaptor for pipen"""
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Any, Mapping, Type
+
+from liquid import Liquid
+
+from .defaults import TEMPLATE_ENTRY_GROUP
+from .exceptions import NoSuchTemplateEngineError, WrongTemplateEnginTypeError
+from .utils import is_subclass, load_entrypoints
+
+__all__ = [
+ "Template",
+ "TemplateLiquid",
+ "TemplateJinja2",
+ "get_template_engine",
+]
+
+
+class Template(ABC):
+ """Base class wrapper to wrap template for pipen"""
+
+ def __init__(
+ self,
+ source: Any,
+ **kwargs: Any,
+ ):
+ """Template construct"""
+ self.engine: Any = None
+
+    def render(self, data: Mapping[str, Any] = None) -> str:
+        """Render the template
+
+        Args:
+            data: The data used to render
+
+        Returns:
+            The rendered string
+        """
+ return self._render(data or {})
+
+ @abstractmethod
+ def _render(self, data: Mapping[str, Any]) -> str:
+ """Implement rendering"""
+
+
+class TemplateLiquid(Template):
+ """Liquidpy template wrapper."""
+
+ name = "liquid"
+
+ def __init__(
+ self,
+ source: Any,
+ **kwargs: Any,
+ ):
+ """Initiate the engine with source and envs
+
+ Args:
+ source: The souce text
+ envs: The env data
+ **kwargs: Other arguments for Liquid
+ """
+ super().__init__(source)
+ self.engine = Liquid(
+ source,
+ from_file=False,
+ mode="wild",
+ **kwargs,
+ )
+
+ def _render(self, data: Mapping[str, Any]) -> str:
+ """Render the template
+
+ Args:
+ data: The data used for rendering
+
+        Returns:
+ The rendered string
+ """
+ return self.engine.render(data)
+
+
+class TemplateJinja2(Template):
+ """Jinja2 template wrapper"""
+
+ name = "jinja2"
+
+ def __init__(
+ self,
+ source: Any,
+ **kwargs: Any,
+ ):
+ """Initiate the engine with source and envs
+
+ Args:
+ source: The souce text
+ envs: The env data
+ **kwargs: Other arguments for jinja2.Template
+ """
+ import jinja2
+
+ super().__init__(source)
+        filters = kwargs.pop("filters", {})
+        envs = kwargs.pop("globals", {})
+ self.engine = jinja2.Template(source, **kwargs)
+ self.engine.globals.update(envs)
+ self.engine.environment.filters.update(filters)
+
+ def _render(self, data: Mapping[str, Any]) -> str:
+ """Render the template
+
+ Args:
+ data: The data used for rendering
+
+        Returns:
+ The rendered string
+ """
+ return self.engine.render(data)
+
+
+def get_template_engine(template: str | Type[Template]) -> Type[Template]:
+ """Get the template engine by name or the template engine itself
+
+ Args:
+ template: The name of the template engine or the template engine itself
+
+ Returns:
+ The template engine
+ """
+ if is_subclass(template, Template):
+ return template # type: ignore
+
+ if template == "liquid":
+ return TemplateLiquid
+
+ if template == "jinja2":
+ return TemplateJinja2
+
+ for name, obj in load_entrypoints(
+ TEMPLATE_ENTRY_GROUP
+ ): # pragma: no cover
+ if name == template:
+ if not is_subclass(obj, Template):
+ raise WrongTemplateEnginTypeError(
+ "Template engine should be a subclass of "
+ "pipen.templates.Template."
+ )
+ return obj
+
+ raise NoSuchTemplateEngineError(str(template))
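+
+# Illustrative usage sketch (mirrors the calls pipen itself makes):
+#   engine = get_template_engine("liquid")          # -> TemplateLiquid
+#   engine("Hello {{name}}").render({"name": "pipen"})
+#   # roughly -> "Hello pipen"
+# Passing a Template subclass returns it unchanged.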
+
"""Provide some utilities"""
+from __future__ import annotations
+
+import re
+import sys
+import importlib
+import importlib.util
+import logging
+import textwrap
+import typing
+from itertools import groupby
+from operator import itemgetter
+from io import StringIO
+from os import PathLike, get_terminal_size, environ
+from collections import defaultdict
+from pathlib import Path
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ DefaultDict,
+ Iterable,
+ List,
+ Mapping,
+ Sequence,
+ Tuple,
+ Type,
+)
+
+import diot
+import simplug
+from rich.console import Console
+from rich.logging import RichHandler as _RichHandler
+from rich.table import Table
+from rich.text import Text
+from simplug import SimplugContext
+
+from .defaults import (
+ CONSOLE_DEFAULT_WIDTH,
+ CONSOLE_WIDTH_WITH_PANEL,
+ CONSOLE_WIDTH_SHIFT,
+ LOGGER_NAME,
+)
+from .version import __version__
+
+from importlib import metadata as importlib_metadata
+
+if TYPE_CHECKING: # pragma: no cover
+ import pandas
+ from rich.segment import Segment
+ from rich.console import RenderableType
+
+ from .pipen import Pipen
+ from .proc import Proc
+ from .procgroup import ProcGroup
+
+LOADING_ARGV0 = "@pipen"
+
+
+class RichHandler(_RichHandler):
+ """Subclass of rich.logging.RichHandler, showing log levels as a single
+ character"""
+
+    def get_level_text(self, record: logging.LogRecord) -> Text:
+ """Get the level name from the record.
+
+ Args:
+ record: LogRecord instance.
+
+ Returns:
+            The level name as a single styled character.
+ """
+ level_name = record.levelname
+ level_text = Text.styled(
+ level_name[0].upper(), f"logging.level.{level_name.lower()}"
+ )
+ return level_text
+
+
+class RichConsole(Console):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ try:
+ self._width = get_terminal_size().columns
+ except (AttributeError, ValueError, OSError): # maybe not a terminal
+ if environ.get("JUPYTER_COLUMNS") is not None: # pragma: no cover
+ self._width = int(environ.get("JUPYTER_COLUMNS"))
+ elif environ.get("COLUMNS") is not None: # pragma: no cover
+ self._width = int(environ.get("COLUMNS"))
+ else:
+ self._width = CONSOLE_DEFAULT_WIDTH
+
+ def _render_buffer(self, buffer: Iterable[Segment]) -> str:
+ out = super()._render_buffer(buffer)
+ return out.rstrip() + "\n"
+
+
+logging.lastResort = logging.NullHandler() # type: ignore
+logger_console = RichConsole()
+_logger_handler = RichHandler(
+ show_path=False,
+ show_level=True,
+ console=logger_console,
+ rich_tracebacks=True,
+ omit_repeated_times=False, # rich 10+
+ markup=True,
+ log_time_format="%m-%d %H:%M:%S",
+ tracebacks_extra_lines=0,
+ tracebacks_suppress=[simplug, diot, typing],
+)
+_logger_handler.setFormatter(
+ logging.Formatter("[purple]%(plugin_name)-7s[/purple] %(message)s")
+)
+
+
+def _excepthook(
+ type_: Type[BaseException],
+ value: BaseException,
+ traceback: Any,
+) -> None:
+ """The excepthook for pipen, to show rich traceback"""
+ if issubclass(type_, KeyboardInterrupt): # pragma: no cover
+ logger.error("")
+ logger.error("Interrupted by user")
+ return
+
+ print("", file=sys.stderr)
+ _excepthook.oldhook(type_, value, traceback)
+
+
+_excepthook.oldhook = sys.excepthook
+sys.excepthook = _excepthook
+
+
+def get_logger(
+    name: str = LOGGER_NAME,
+    level: str | int = None,
+) -> logging.LoggerAdapter:
+    """Get the logger by the given plugin name
+
+    Args:
+        name: The plugin name, shown in the log records
+        level: The initial level of the logger
+
+ Returns:
+ The logger
+ """
+ log = logging.getLogger(f"pipen.{name}")
+ log.addHandler(_logger_handler)
+
+ if level is not None:
+ log.setLevel(level.upper() if isinstance(level, str) else level)
+
+ return logging.LoggerAdapter(log, {"plugin_name": name})
+
+
+logger = get_logger()
+
+
+def desc_from_docstring(
+ obj: Type[Pipen | Proc],
+ base: Type[Pipen | Proc],
+) -> str:
+ """Get the description from docstring
+
+ Only extract the summary.
+
+    Args:
+        obj: The object with docstring
+        base: The base class used to locate a fallback docstring in obj's MRO
+            when obj's own docstring is empty
+
+ Returns:
+ The summary as desc
+ """
+ if not obj.__doc__:
+ # If the docstring is empty, use the base's docstring
+ # Get the base from mro
+ bases = [
+ cls
+ for cls in obj.__mro__
+ if is_subclass(cls, base) and cls != base and cls != obj
+ ]
+ if not bases:
+ return None
+
+ return desc_from_docstring(bases[0], base)
+
+ started: bool = False
+ out: List[str] = []
+ for line in obj.__doc__.splitlines():
+ line = line.strip()
+ if not started and not line:
+ continue
+ if not started:
+ out.append(line)
+ started = True
+ elif line:
+ out.append(line)
+ else:
+ break
+
+ return " ".join(out)
+
+
+def update_dict(
+ parent: Mapping[str, Any],
+ new: Mapping[str, Any],
+ depth: int = 0,
+) -> Mapping[str, Any]:
+ """Update the new dict to the parent, but make sure parent does not change
+
+ Args:
+ parent: The parent dictionary
+ new: The new dictionary
+ depth: The depth to be copied. 0 for updating to the deepest level.
+
+ Examples:
+ >>> parent = {"a": {"b": 1}}
+ >>> new = {"a": {"c": 2}}
+ >>> update_dict(parent, new)
+ >>> # {"a": {"b": 1, "c": 2}}
+
+ Returns:
+ The updated dictionary or None if both parent and new are None.
+ """
+ if parent is None and new is None:
+ return None
+
+ out = (parent or {}).copy()
+ for key, val in (new or {}).items():
+ if (
+ key not in out
+ or not isinstance(val, dict)
+ or not isinstance(out[key], dict)
+ or depth == 1
+ ):
+ out[key] = val
+ else:
+ out[key] = update_dict(out[key], val, depth - 1)
+
+ return out
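+
+# A further sketch of the `depth` behaviour (values are made up):
+#   update_dict({"a": {"b": 1}}, {"a": {"c": 2}}, depth=1)
+#   # -> {"a": {"c": 2}}: at depth 1 the nested dict is replaced, not merged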
+
+
+def strsplit(
+ string: str,
+ sep: str,
+ maxsplit: int = -1,
+ trim: str = "both",
+) -> List[str]:
+ """Split the string, with the ability to trim each part."""
+ parts = string.split(sep, maxsplit=maxsplit)
+ if trim is None:
+ return parts
+ if trim == "left":
+ return [part.lstrip() for part in parts]
+ if trim == "right":
+ return [part.rstrip() for part in parts]
+
+ return [part.strip() for part in parts]
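+
+# Illustrative: strsplit("a : b : c", ":", 1) -> ["a", "b : c"]
+# (maxsplit limits the splits; both ends of each part are stripped by default)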
+
+
+def get_shebang(script: str) -> str:
+ """Get the shebang of the script
+
+ Args:
+ script: The script string
+
+ Returns:
+ None if the script does not contain a shebang, otherwise the shebang
+ without `#!` prefix
+ """
+ script = script.lstrip()
+ if not script.startswith("#!"):
+ return None
+
+ if "\n" not in script:
+ return script[2:].strip()
+
+ shebang_line, _ = strsplit(script, "\n", 1)
+ return shebang_line[2:].strip()
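+
+# Illustrative: get_shebang("#!/usr/bin/env bash\necho hi") -> "/usr/bin/env bash"
+# while a script without a leading "#!" yields None.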
+
+
+def ignore_firstline_dedent(text: str) -> str:
+ """Like textwrap.dedent(), but ignore first empty lines
+
+ Args:
+        text: The text to be dedented
+
+ Returns:
+ The dedented text
+ """
+ out = []
+ started = False
+ for line in text.splitlines():
+ if not started and not line.strip():
+ continue
+ if not started:
+ started = True
+ out.append(line)
+
+ return textwrap.dedent("\n".join(out))
+
+
+def copy_dict(dic: Mapping[str, Any], depth: int = 1) -> Mapping[str, Any]:
+ """Deep copy a dict
+
+ Args:
+ dic: The dict to be copied
+ depth: The depth to be deep copied
+
+ Returns:
+ The deep-copied dict
+ """
+ if depth <= 1:
+ return dic.copy()
+
+ return {
+ key: copy_dict(val, depth - 1) if isinstance(val, dict) else val
+ for key, val in dic.items()
+ }
+
+
+def get_logpanel_width() -> int:
+    """Get the width of the log content
+
+    Note that this is not the console width; the width of the log meta
+    info (CONSOLE_WIDTH_SHIFT) is subtracted from it.
+
+    Returns:
+        The width of the log content
+    """
+ return (
+ min(
+ logger_console.width,
+ CONSOLE_WIDTH_WITH_PANEL,
+ )
+ - CONSOLE_WIDTH_SHIFT
+ )
+
+
+def log_rich_renderable(
+ renderable: RenderableType,
+ color: str | None,
+ logfunc: Callable,
+ *args: Any,
+ **kwargs: Any,
+) -> None:
+ """Log a rich renderable to logger
+
+    Args:
+        renderable: The rich renderable
+        color: The color to wrap each rendered line in, if any
+        logfunc: The log function; if the message is not its first argument,
+            use functools.partial to wrap it
+ *args: The arguments to the log function
+ **kwargs: The keyword arguments to the log function
+ """
+ console = Console(
+ file=StringIO(),
+ width=logger_console.width - CONSOLE_WIDTH_SHIFT,
+ )
+ console.print(renderable)
+
+ for line in console.file.getvalue().splitlines():
+ logfunc(
+ f"[{color}]{line}[/{color}]" if color else line,
+ *args,
+ **kwargs,
+ )
+
+
+def brief_list(blist: List[int]) -> str:
+ """Briefly show an integer list, combine the continuous numbers.
+
+ Args:
+ blist: The list
+
+ Returns:
+ The string to show for the briefed list.
+ """
+ ret = []
+ for _, g in groupby(enumerate(blist), lambda x: x[0] - x[1]):
+ list_group = list(map(itemgetter(1), g))
+ if len(list_group) > 1:
+ ret.append(f"{list_group[0]}-{list_group[-1]}")
+ else:
+ ret.append(str(list_group[0]))
+ return ", ".join(ret)
+
+
+def pipen_banner() -> RenderableType:
+ """The banner for pipen
+
+ Returns:
+ The banner renderable
+ """
+ table = Table(
+ width=get_logpanel_width(),
+ show_header=False,
+ show_edge=False,
+ show_footer=False,
+ show_lines=False,
+ caption=f"version: {__version__}",
+ )
+ table.add_column(justify="center")
+ table.add_row(r" _____________________________________ __")
+ table.add_row(r" ___ __ \___ _/__ __ \__ ____/__ | / /")
+ table.add_row(r" __ /_/ /__ / __ /_/ /_ __/ __ |/ / ")
+ table.add_row(r" _ ____/__/ / _ ____/_ /___ _ /| / ")
+ table.add_row(r"/_/ /___/ /_/ /_____/ /_/ |_/ ")
+ table.add_row("")
+
+ return table
+
+
+def get_mtime(path: str | PathLike, dir_depth: int = 1) -> float:
+    """Get the modification time of a path.
+
+    If path is a directory, try to get the last modification time of the
+    contents in the directory at the given dir_depth.
+
+    Args:
+        path: The path to check
+        dir_depth: The depth of the directory to check the
+            last modification time
+
+    Returns:
+        The last modification time of path
+    """
+ path = Path(path)
+ if not path.is_dir() or dir_depth == 0:
+ return path.lstat().st_mtime if path.is_symlink() else path.stat().st_mtime
+
+ mtime = 0.0
+ for file in path.glob("*"):
+ mtime = max(mtime, get_mtime(file, dir_depth - 1))
+ return mtime
+
+
+def is_subclass(obj: Any, cls: type) -> bool:
+    """Tell if obj is a subclass of cls
+
+    The difference from issubclass() is that no TypeError is raised when obj
+    is not a class
+
+ Args:
+ obj: The object to check
+ cls: The class to check
+
+ Returns:
+ True if obj is a subclass of cls otherwise False
+ """
+ try:
+ return issubclass(obj, cls)
+ except TypeError:
+ return False
+
+
+def load_entrypoints(
+ group: str
+) -> Iterable[Tuple[str, Any]]: # pragma: no cover
+ """Load objects from setuptools entrypoints by given group name
+
+ Args:
+ group: The group name of the entrypoints
+
+ Returns:
+ An iterable of tuples with name and the loaded object
+ """
+ try:
+ eps = importlib_metadata.entry_points(group=group)
+ except TypeError:
+ eps = importlib_metadata.entry_points().get(group, []) # type: ignore
+
+ yield from ((ep.name, ep.load()) for ep in eps)
+
+
+def truncate_text(text: str, width: int, end: str = "…") -> str:
+    """Truncate a text, not based on words/whitespaces
+    (otherwise we could use textwrap.shorten).
+
+    Args:
+        text: The text to be truncated
+        width: The max width of the truncated text
+        end: The end string of the truncated text
+
+    Returns:
+        The truncated text with end appended.
+ """
+ if len(text) <= width:
+ return text
+
+ return text[: (width - len(end))] + end
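+
+# Illustrative: truncate_text("pipeline-name", 8) -> "pipelin…"
+# (the text is cut to leave room for the single-character end marker)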
+
+
+def make_df_colnames_unique_inplace(thedf: pandas.DataFrame) -> None:
+ """Make the columns of a data frame unique
+
+ Args:
+ thedf: The data frame
+ """
+ col_counts: DefaultDict = defaultdict(lambda: 0)
+ new_cols = []
+ for col in thedf.columns:
+ if col_counts[col] == 0:
+ new_cols.append(col)
+ else:
+ new_cols.append(f"{col}_{col_counts[col]}")
+ col_counts[col] += 1
+ thedf.columns = new_cols
+
+
+def get_base(
+ klass: Type,
+ abc_base: Type,
+ value: Any,
+ value_getter: Callable,
+) -> Type:
+ """Get the base class where the value was first defined
+
+ Args:
+ klass: The class
+ abc_base: The very base class to check in __bases__
+ value: The value to check
+ value_getter: How to get the value from the class
+
+ Returns:
+ The base class
+ """
+ bases = [
+ base
+ for base in klass.__bases__
+ if issubclass(base, abc_base) and value_getter(base) == value
+ ]
+ if not bases:
+ return klass
+
+ return get_base(bases[0], abc_base, value, value_getter)
+
+
+def mark(**kwargs) -> Callable[[type], type]:
+ """Mark a class (e.g. Proc) with given kwargs as metadata
+
+ These marks will not be inherited by the subclasses if the class is
+ a subclass of `Proc` or `ProcGroup`.
+
+ Args:
+ **kwargs: The kwargs to mark the proc
+
+ Returns:
+ The decorator
+ """
+ def decorator(cls: type) -> type:
+ if not getattr(cls, "__meta__", None):
+ cls.__meta__ = {}
+
+ cls.__meta__.update(kwargs)
+ return cls
+
+ return decorator
+
+
+def get_marked(cls: type, mark_name: str, default: Any = None) -> Any:
+ """Get the marked value from a proc
+
+ Args:
+ cls: The proc
+ mark_name: The mark name
+ default: The default value if the mark is not found
+
+ Returns:
+ The marked value
+ """
+ if not getattr(cls, "__meta__", None):
+ return default
+
+ return cls.__meta__.get(mark_name, default)
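+
+# Minimal sketch of mark/get_marked (the mark name and class are made up):
+#   @mark(deprecated=True)
+#   class MyProc(Proc): ...
+#   get_marked(MyProc, "deprecated")          # -> True
+#   get_marked(MyProc, "missing", default=0)  # -> 0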
+
+
+def is_valid_name(name: str) -> bool:
+ """Check if a name is valid for a proc or pipen
+
+ Args:
+ name: The name to check
+
+ Returns:
+ True if valid, otherwise False
+ """
+ return re.match(r"^[\w.-]+$", name) is not None
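+
+# Illustrative: is_valid_name("My-Proc.v1") -> True; is_valid_name("my proc") -> False
+# (only word characters, "." and "-" are accepted)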
+
+
+def _get_obj_from_spec(spec: str) -> Any:
+ """Get the object from a spec like `<module[.submodule]>:name` or
+ `/path/to/script.py:name`
+
+ Args:
+ spec: The spec
+
+ Returns:
+ The object
+
+ Raises:
+ AttributeError: If name cannot be found in the module
+ """
+ modpath, sep, name = spec.rpartition(":")
+ if sep != ":":
+ raise ValueError(
+ f"Invalid specification: {spec}.\n"
+ "It must be in the format '<module[.submodule]>:name' or \n"
+ "'/path/to/spec.py:name'"
+ )
+
+ path = Path(modpath)
+ if path.is_file():
+ mspec = importlib.util.spec_from_file_location(path.stem, modpath)
+ module = importlib.util.module_from_spec(mspec)
+ mspec.loader.exec_module(module)
+ else:
+ module = importlib.import_module(modpath)
+
+ return getattr(module, name)
+
+
+async def load_pipeline(
+ obj: str | Type[Proc] | Type[ProcGroup] | Type[Pipen],
+ argv0: str | None = None,
+ argv1p: Sequence[str] | None = None,
+ **kwargs: Any,
+) -> Pipen:
+ """Load a pipeline from a Pipen, Proc or ProcGroup object
+
+    It not only loads the Pipen object (or converts a Proc/ProcGroup object
+    into one), but also builds the process relationships, so that
+    `pipeline.procs` and the `requires`/`nexts` of each proc can be accessed.
+
+ To avoid running the pipeline and notify the plugins that this is just
+ for loading the pipeline, `sys.argv[0]` is set to `@pipen`.
+
+ Args:
+ obj: The Pipen, Proc or ProcGroup object. It can also be a string in
+ the format of `part1:part2` to load the pipeline, where part1 is
+ a path to a python file or package directory, and part2 is the name
+ of the proc, procgroup or pipeline to load.
+ It should be able to be loaded by `getattr(module, part2)`, where
+ module is loaded from `part1`.
+ argv0: The value to replace sys.argv[0]. "@pipen" will be used
+ by default.
+ argv1p: The values to replace sys.argv[1:]. Do not replace by default.
+ kwargs: The kwargs to pass to the Pipen constructor
+
+ Returns:
+ The loaded Pipen object
+
+ Raises:
+ TypeError: If obj or loaded obj is not a Pipen, Proc or ProcGroup
+ object
+ """
+ from .pipen import Pipen
+ from .proc import Proc
+ from .procgroup import ProcGroup
+
+ old_argv = sys.argv
+ if argv0 is None:
+ # Set it at runtime to allow LOADING_ARGV0 to be monkey-patched
+ argv0 = LOADING_ARGV0
+ if argv1p is None:
+ # Set it at runtime to adopt sys.argv changes
+ argv1p = sys.argv[1:]
+ sys.argv = [argv0] + list(argv1p)
+
+ try:
+ if isinstance(obj, str):
+ obj = _get_obj_from_spec(obj)
+        if not isinstance(obj, Pipen) and not (
+            isinstance(obj, type) and issubclass(obj, (Pipen, Proc, ProcGroup))
+        ):
+            raise TypeError(
+                "Expected a Pipen, Proc, ProcGroup class, or a Pipen object, "
+                f"got {type(obj)}"
+            )
+
+ pipeline = obj
+ if isinstance(obj, type) and issubclass(obj, Proc):
+ kwargs.setdefault("name", f"{obj.name}Pipeline")
+ pipeline = Pipen(**kwargs).set_starts(obj)
+
+ elif isinstance(obj, type) and issubclass(obj, ProcGroup):
+ pipeline = obj().as_pipen(**kwargs) # type: ignore
+
+ elif isinstance(obj, type) and issubclass(obj, Pipen):
+            # Avoid "pipeline" being picked up as the pipeline name by varname
+ (pipeline, ) = (obj(**kwargs), ) # type: ignore
+
+ elif isinstance(obj, Pipen):
+ pipeline._kwargs.update(kwargs)
+
+        # Initialize the pipeline so that the arguments defined by
+        # other plugins (e.g. pipen-args) take effect.
+ pipeline.workdir = Path(pipeline.config.workdir).joinpath(
+ kwargs.get("name", pipeline.name)
+ )
+ await pipeline._init()
+ pipeline.workdir.mkdir(parents=True, exist_ok=True)
+ pipeline.build_proc_relationships()
+ finally:
+ sys.argv = old_argv
+
+ return pipeline
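+
+# Minimal usage sketch (the script path and attribute name are hypothetical):
+#   pipeline = await load_pipeline("./my_pipeline.py:MyProcGroup", name="Demo")
+#   pipeline.procs  # processes with requires/nexts resolved
+# While loading, sys.argv[0] is temporarily "@pipen" so plugins can tell a
+# load apart from a real run.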
+
+
+def is_loading_pipeline(*flags: str, argv: Sequence[str] | None = None) -> bool:
+ """Check if we are loading the pipeline. Works only when
+ `argv0` is "@pipen" while loading the pipeline.
+
+ Note if you are using this function at compile time, make
+    sure you load your pipeline using the string form (`part1:part2`).
+    See `load_pipeline()` for more details.
+
+ Args:
+ *flags: Additional flags to check in sys.argv (e.g. "-h", "--help")
+ to determine if we are loading the pipeline
+ argv: The arguments to check. sys.argv is used by default.
+ Note that the first argument should be included in the check.
+ You could typically pass `[sys.argv[0], *your_args]` to this if you want
+ to check if `sys.argv[0]` is "@pipen" or `your_args` contains some flags.
+
+ Returns:
+ True if we are loading the pipeline (argv[0] == "@pipen"),
+ otherwise False
+ """
+ if argv is None:
+ argv = sys.argv
+
+ if len(argv) > 0 and argv[0] == LOADING_ARGV0:
+ return True
+
+ if flags:
+ return any(flag in argv for flag in flags)
+
+ return False # pragma: no cover
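+
+# Illustrative check at import time of a pipeline script:
+#   if is_loading_pipeline("-h", "--help"):
+#       ...  # skip heavy setup while the pipeline is only being inspected
+# True when argv[0] == "@pipen" or when any given flag appears in argv.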
+
"""Provide version of pipen"""
+
+__version__ = "0.15.6"
+
"""A pipeline framework for python"""
+from .pipen import Pipen, run
+from .proc import Proc
+from .procgroup import ProcGroup
+
+# Use from pipen.channel import Channel instead of
+# from pipen import Channel
+# This slows down import
+# from .channel import Channel
+from .pluginmgr import plugin
+from .version import __version__
+