# 导入包
import pandas as pd
import numpy as np
from faker import Faker
# 伪造数据用于示例
faker = Faker(locale='zh-CN')
Faker.seed(12)
raw_df = pd.DataFrame({
'name': [faker.name() for x in range(5)],
'gender': [faker.profile()['sex'] for x in range(5)],
'age': np.random.randint(18, 80, 5),
'date': [faker.date() for x in range(5)],
'amount': np.random.randint(10, 10000, 5),
'postcode': [faker.postcode() for x in range(5)],
'cnt': [np.NaN, 10, np.NaN, 20, np.NaN]
})
raw_df
name | gender | age | date | amount | postcode | cnt | |
---|---|---|---|---|---|---|---|
0 | 刘倩 | F | 18 | 2012-04-06 | 1352 | 855589 | NaN |
1 | 吴柳 | F | 31 | 2009-04-11 | 1152 | 629635 | 10.0 |
2 | 陈桂芳 | M | 62 | 1988-10-18 | 280 | 124556 | NaN |
3 | 罗峰 | F | 34 | 1995-10-02 | 4154 | 381814 | 20.0 |
4 | 黄雪 | M | 78 | 1992-11-05 | 7501 | 514273 | NaN |
# 先查看下数据集基本信息
raw_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5 entries, 0 to 4 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 5 non-null object 1 gender 5 non-null object 2 age 5 non-null int32 3 date 5 non-null object 4 amount 5 non-null int32 5 postcode 5 non-null object 6 cnt 2 non-null float64 dtypes: float64(1), int32(2), object(4) memory usage: 368.0+ bytes
DataFrame.convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True)
df = raw_df.copy()
df = df.convert_dtypes()
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5 entries, 0 to 4 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 5 non-null string 1 gender 5 non-null string 2 age 5 non-null Int32 3 date 5 non-null string 4 amount 5 non-null Int32 5 postcode 5 non-null string 6 cnt 2 non-null Int64 dtypes: Int32(2), Int64(1), string(4) memory usage: 383.0 bytes
Series.astype(dtype, copy=True, errors='raise')
df = raw_df.copy()
df['is_male'] = df['gender'] == 'M'
df['is_male'] = df['is_male'].astype('int8') # 将布尔值True、False转成数值1、0
df = df.astype({
'name': 'string',
'gender': 'string',
'date': 'datetime64[ns]'
})
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5 entries, 0 to 4 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 5 non-null string 1 gender 5 non-null string 2 age 5 non-null int32 3 date 5 non-null datetime64[ns] 4 amount 5 non-null int32 5 postcode 5 non-null object 6 cnt 2 non-null float64 7 is_male 5 non-null int8 dtypes: datetime64[ns](1), float64(1), int32(2), int8(1), object(1), string(2) memory usage: 373.0+ bytes
DataFrame.select_dtypes(include=None, exclude=None)
df = raw_df.copy()
df.select_dtypes(include=np.number) # 选择所有数值列
df.select_dtypes(exclude='object') # 选择所有非object类型的列
age | amount | cnt | |
---|---|---|---|
0 | 18 | 1352 | NaN |
1 | 31 | 1152 | 10.0 |
2 | 62 | 280 | NaN |
3 | 34 | 4154 | 20.0 |
4 | 78 | 7501 | NaN |
pandas.to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix', cache=True)
df = raw_df.copy()
df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
df.dtypes
name object gender object age int32 date datetime64[ns] amount int32 postcode object cnt float64 dtype: object
pandas.to_numeric(arg, errors='raise', downcast=None)
df = raw_df.copy()
df['postcode'] = pd.to_numeric(df['postcode'])
df.dtypes
name object gender object age int32 date object amount int32 postcode int64 cnt float64 dtype: object
Series.to_period(freq=None, copy=True)
df = raw_df.copy()
df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
df['week'] = df['date'].dt.to_period('W').dt.start_time
df['month'] = df['date'].dt.to_period('M')
df['month_start'] = df['date'].dt.to_period('M').dt.start_time
df['month_end'] = df['date'].dt.to_period('M').dt.end_time
df['year'] = df['date'].dt.to_period('Y')
df['year_start'] = df['date'].dt.to_period('Y').dt.start_time
df['year_end'] = df['date'].dt.to_period('Y').dt.end_time
df[['date', 'week', 'month', 'month_start', 'month_end', 'year', 'year_start', 'year_end']]
date | week | month | month_start | month_end | year | year_start | year_end | |
---|---|---|---|---|---|---|---|---|
0 | 2012-04-06 | 2012-04-02 | 2012-04 | 2012-04-01 | 2012-04-30 23:59:59.999999999 | 2012 | 2012-01-01 | 2012-12-31 23:59:59.999999999 |
1 | 2009-04-11 | 2009-04-06 | 2009-04 | 2009-04-01 | 2009-04-30 23:59:59.999999999 | 2009 | 2009-01-01 | 2009-12-31 23:59:59.999999999 |
2 | 1988-10-18 | 1988-10-17 | 1988-10 | 1988-10-01 | 1988-10-31 23:59:59.999999999 | 1988 | 1988-01-01 | 1988-12-31 23:59:59.999999999 |
3 | 1995-10-02 | 1995-10-02 | 1995-10 | 1995-10-01 | 1995-10-31 23:59:59.999999999 | 1995 | 1995-01-01 | 1995-12-31 23:59:59.999999999 |
4 | 1992-11-05 | 1992-11-02 | 1992-11 | 1992-11-01 | 1992-11-30 23:59:59.999999999 | 1992 | 1992-01-01 | 1992-12-31 23:59:59.999999999 |
Series.isin(values)
df = raw_df.copy()
df['is_good'] = df['name'].isin(['吴柳', '罗峰']).astype('int8')
df
name | gender | age | date | amount | postcode | cnt | is_good | |
---|---|---|---|---|---|---|---|---|
0 | 刘倩 | F | 18 | 2012-04-06 | 1352 | 855589 | NaN | 0 |
1 | 吴柳 | F | 31 | 2009-04-11 | 1152 | 629635 | 10.0 | 1 |
2 | 陈桂芳 | M | 62 | 1988-10-18 | 280 | 124556 | NaN | 0 |
3 | 罗峰 | F | 34 | 1995-10-02 | 4154 | 381814 | 20.0 | 1 |
4 | 黄雪 | M | 78 | 1992-11-05 | 7501 | 514273 | NaN | 0 |
Series.between(left, right, inclusive='both')
df = raw_df.copy()
df['amount_5k_1w'] = df['amount'].between(5000, 10000, inclusive='left')
df
name | gender | age | date | amount | postcode | cnt | amount_5k_1w | |
---|---|---|---|---|---|---|---|---|
0 | 刘倩 | F | 18 | 2012-04-06 | 1352 | 855589 | NaN | False |
1 | 吴柳 | F | 31 | 2009-04-11 | 1152 | 629635 | 10.0 | False |
2 | 陈桂芳 | M | 62 | 1988-10-18 | 280 | 124556 | NaN | False |
3 | 罗峰 | F | 34 | 1995-10-02 | 4154 | 381814 | 20.0 | False |
4 | 黄雪 | M | 78 | 1992-11-05 | 7501 | 514273 | NaN | True |
DataFrame.nlargest(n, columns, keep='first')
DataFrame.nsmallest(n, columns, keep='first')
df = raw_df.copy()
df.nlargest(3, 'amount')
name | gender | age | date | amount | postcode | cnt | |
---|---|---|---|---|---|---|---|
4 | 黄雪 | M | 78 | 1992-11-05 | 7501 | 514273 | NaN |
3 | 罗峰 | F | 34 | 1995-10-02 | 4154 | 381814 | 20.0 |
0 | 刘倩 | F | 18 | 2012-04-06 | 1352 | 855589 | NaN |
pandas.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise', ordered=True)
pandas.qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise')
df = raw_df.copy()
bins = [0, 30, 60, 100]
labels = ['青', '中', '老']
df['age_group'] = pd.cut(df['age'], bins, labels=labels)
df
name | gender | age | date | amount | postcode | cnt | age_group | |
---|---|---|---|---|---|---|---|---|
0 | 刘倩 | F | 18 | 2012-04-06 | 1352 | 855589 | NaN | 青 |
1 | 吴柳 | F | 31 | 2009-04-11 | 1152 | 629635 | 10.0 | 中 |
2 | 陈桂芳 | M | 62 | 1988-10-18 | 280 | 124556 | NaN | 老 |
3 | 罗峰 | F | 34 | 1995-10-02 | 4154 | 381814 | 20.0 | 中 |
4 | 黄雪 | M | 78 | 1992-11-05 | 7501 | 514273 | NaN | 老 |
DataFrame.transform(func, axis=0, *args, **kwargs)
df = raw_df.copy()
df['amount_mean'] = df.groupby('gender')['amount'].transform('mean')
df
name | gender | age | date | amount | postcode | cnt | amount_mean | |
---|---|---|---|---|---|---|---|---|
0 | 刘倩 | F | 18 | 2012-04-06 | 1352 | 855589 | NaN | 2219.333333 |
1 | 吴柳 | F | 31 | 2009-04-11 | 1152 | 629635 | 10.0 | 2219.333333 |
2 | 陈桂芳 | M | 62 | 1988-10-18 | 280 | 124556 | NaN | 3890.500000 |
3 | 罗峰 | F | 34 | 1995-10-02 | 4154 | 381814 | 20.0 | 2219.333333 |
4 | 黄雪 | M | 78 | 1992-11-05 | 7501 | 514273 | NaN | 3890.500000 |
Series.map(arg, na_action=None)
df = raw_df.copy()
df['gender_new'] = df['gender'].map({'M': 1, 'F': 0})
df['amount_new'] = df['amount'].map(lambda x: round(x/1000, 2))
df['amount_grp'] = df['amount'].map(lambda x: 'low' if x < 5000 else 'high')
df
name | gender | age | date | amount | postcode | cnt | gender_new | amount_new | amount_grp | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 刘倩 | F | 18 | 2012-04-06 | 1352 | 855589 | NaN | 0 | 1.35 | low |
1 | 吴柳 | F | 31 | 2009-04-11 | 1152 | 629635 | 10.0 | 0 | 1.15 | low |
2 | 陈桂芳 | M | 62 | 1988-10-18 | 280 | 124556 | NaN | 1 | 0.28 | low |
3 | 罗峰 | F | 34 | 1995-10-02 | 4154 | 381814 | 20.0 | 0 | 4.15 | low |
4 | 黄雪 | M | 78 | 1992-11-05 | 7501 | 514273 | NaN | 1 | 7.50 | high |
Series.apply(func, convert_dtype=True, args=(), **kwargs)
df = raw_df.copy()
def add_value(x, value):
return x + value
df['age_new'] = df['age'].apply(add_value, value=1)
df
name | gender | age | date | amount | postcode | cnt | age_new | |
---|---|---|---|---|---|---|---|---|
0 | 刘倩 | F | 18 | 2012-04-06 | 1352 | 855589 | NaN | 19 |
1 | 吴柳 | F | 31 | 2009-04-11 | 1152 | 629635 | 10.0 | 32 |
2 | 陈桂芳 | M | 62 | 1988-10-18 | 280 | 124556 | NaN | 63 |
3 | 罗峰 | F | 34 | 1995-10-02 | 4154 | 381814 | 20.0 | 35 |
4 | 黄雪 | M | 78 | 1992-11-05 | 7501 | 514273 | NaN | 79 |
GroupBy.agg(func, *args, **kwargs)
df = raw_df.copy()
grouped = df.groupby(by=['gender']).agg({'amount': [('amount_max', 'max'), ('amount_min', 'min')], 'age': [('age_median', 'median'), ('age_min','min')], 'name': [('cnt', 'nunique')]})
grouped.columns = grouped.columns.droplevel()
grouped
amount_max | amount_min | age_median | age_min | cnt | |
---|---|---|---|---|---|
gender | |||||
F | 4154 | 1152 | 31.0 | 18 | 3 |
M | 7501 | 280 | 70.0 | 62 | 2 |
DataFrame.pivot_table(values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All', observed=False, sort=True)
df = raw_df.copy()
df.pivot_table(values=['amount'], index=['date'], columns=['gender'], aggfunc='sum')
amount | ||
---|---|---|
gender | F | M |
date | ||
1988-10-18 | NaN | 280.0 |
1992-11-05 | NaN | 7501.0 |
1995-10-02 | 4154.0 | NaN |
2009-04-11 | 1152.0 | NaN |
2012-04-06 | 1352.0 | NaN |
Series.cumsum(axis=None, skipna=True, *args, **kwargs)
df = raw_df.copy()
df['amount_cumsum'] = df['amount'].cumsum()
df
name | gender | age | date | amount | postcode | cnt | amount_cumsum | |
---|---|---|---|---|---|---|---|---|
0 | 刘倩 | F | 18 | 2012-04-06 | 1352 | 855589 | NaN | 1352 |
1 | 吴柳 | F | 31 | 2009-04-11 | 1152 | 629635 | 10.0 | 2504 |
2 | 陈桂芳 | M | 62 | 1988-10-18 | 280 | 124556 | NaN | 2784 |
3 | 罗峰 | F | 34 | 1995-10-02 | 4154 | 381814 | 20.0 | 6938 |
4 | 黄雪 | M | 78 | 1992-11-05 | 7501 | 514273 | NaN | 14439 |
df = raw_df.copy()
df['amount_cumsum'] = df.groupby(by=['gender'])['amount'].cumsum()
df
name | gender | age | date | amount | postcode | cnt | amount_cumsum | |
---|---|---|---|---|---|---|---|---|
0 | 刘倩 | F | 18 | 2012-04-06 | 1352 | 855589 | NaN | 1352 |
1 | 吴柳 | F | 31 | 2009-04-11 | 1152 | 629635 | 10.0 | 2504 |
2 | 陈桂芳 | M | 62 | 1988-10-18 | 280 | 124556 | NaN | 280 |
3 | 罗峰 | F | 34 | 1995-10-02 | 4154 | 381814 | 20.0 | 6658 |
4 | 黄雪 | M | 78 | 1992-11-05 | 7501 | 514273 | NaN | 7781 |
Series.pct_change(periods=1, fill_method='pad', limit=None, freq=None, **kwargs)
df = raw_df.copy()
df['amount_pct_change'] = df['amount'].pct_change()
df
name | gender | age | date | amount | postcode | cnt | amount_pct_change | |
---|---|---|---|---|---|---|---|---|
0 | 刘倩 | F | 18 | 2012-04-06 | 1352 | 855589 | NaN | NaN |
1 | 吴柳 | F | 31 | 2009-04-11 | 1152 | 629635 | 10.0 | -0.147929 |
2 | 陈桂芳 | M | 62 | 1988-10-18 | 280 | 124556 | NaN | -0.756944 |
3 | 罗峰 | F | 34 | 1995-10-02 | 4154 | 381814 | 20.0 | 13.835714 |
4 | 黄雪 | M | 78 | 1992-11-05 | 7501 | 514273 | NaN | 0.805729 |
Series.nunique(dropna=True)
df = raw_df.copy()
df['gender'].nunique()
2
DataFrame.corr(method='pearson', min_periods=1)
Series.corr(other, method='pearson', min_periods=None)
df = raw_df.copy()
df.corr()
age | amount | cnt | |
---|---|---|---|
age | 1.000000 | 0.548072 | 1.0 |
amount | 0.548072 | 1.000000 | 1.0 |
cnt | 1.000000 | 1.000000 | 1.0 |