# 导入包
import pandas as pd
from faker import Faker


# Build a small fake dataset (5 rows) that every example below copies from.
# The generator uses the zh-CN locale and is seeded with a fixed value.
# Columns are generated one after another in the order name, email, address,
# code; keep that order so the generated values do not change.
faker = Faker(locale='zh-CN')
Faker.seed(12)
raw_df = pd.DataFrame(dict(
    name=[faker.name() for _ in range(5)],
    email=[faker.email() for _ in range(5)],
    address=[faker.address() for _ in range(5)],
    # Codes are kept as strings so the .str examples can operate on them.
    code=[str(faker.random.randint(1, 10000)) for _ in range(5)],
))
raw_df


# String length: `name_len` holds the number of characters in each name.
# `assign` returns a new frame, so `raw_df` itself is left untouched.
df = raw_df.assign(name_len=raw_df['name'].str.len())
df[['name', 'name_len']]


# Split: break each e-mail address at '@' and keep the part after it.
df = raw_df.copy()
# expand=True yields one column per piece: 0 = before '@', 1 = after '@'.
email_parts = df['email'].str.split('@', expand=True)
df['email_domain'] = email_parts[1]
df[['email', 'email_domain']]


# Split into several columns: expand the two halves of the e-mail address
# into their own named columns and attach them to the original frame.
df = raw_df.copy()
df2 = df['email'].str.split('@', expand=True)
df2.columns = ['email_prefix', 'email_suffix']
# df2 shares df's index, so an index-aligned join places the new columns
# side by side with the existing ones.
df3 = df.join(df2)
df3


# Strip: remove leading/trailing whitespace and compare lengths before/after.
df = raw_df.copy()
df['string1'] = [' xyz', 'abc  ', ' efg ', '123', '']
# Later keyword arguments may refer to columns created by earlier ones,
# so `string2_len` can read the freshly created `string2`.
df = df.assign(
    string2=lambda frame: frame['string1'].str.strip(),
    string1_len=lambda frame: frame['string1'].str.len(),
    string2_len=lambda frame: frame['string2'].str.len(),
)
df[['string1', 'string2', 'string1_len', 'string2_len']]


# Contains: flag e-mail addresses that contain the literal text '.org'.
# regex=False makes '.' match a real dot rather than "any character".
df = raw_df.assign(
    is_email_org=raw_df['email'].str.contains('.org', regex=False),
)
df[['email', 'is_email_org']]


# Prefix / suffix tests: does the address start with '湖南', and does the
# e-mail address end with '.org'?
df = raw_df.assign(
    is_province_Hunan=raw_df['address'].str.startswith('湖南'),
    is_email_org=raw_df['email'].str.endswith('.org'),
)
df[['email', 'address', 'is_province_Hunan', 'is_email_org']]


# Replace: substitute substrings, either literally or by regular expression.
df = raw_df.copy()
df['string1'] = [' xyz', 'abc  ', 'e f g', '\t', '\n']
# Literal replacements state regex=False explicitly, because the default value
# of `regex` differs between pandas versions.
df['string2'] = df['string1'].str.replace('abc', 'ABC', regex=False)  # replace 'abc' with 'ABC'
df['string3'] = df['string1'].str.replace(' ', '', regex=False)  # remove plain spaces only
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
df['string4'] = df['string1'].str.replace(r'\s', '', regex=True)  # remove all whitespace (spaces, tabs, newlines)
df[['string1', 'string2', 'string3', 'string4']]


# Translate: one-to-one character mapping (7 -> A, 4 -> B, 5 -> C).
intab, outab = '745', 'ABC'
# The i-th character of `intab` is mapped to the i-th character of `outab`.
tab = str.maketrans(intab, outab)
df = raw_df.assign(code_new=raw_df['code'].str.translate(tab))
df[['code', 'code_new']]


# Extract: pull substrings matching a pattern out of the address.
df = raw_df.copy()
# With a single capture group, expand=False returns a Series directly;
# rows that do not match become NaN.
df['province1'] = df['address'].str.extract(r'(.*省)', expand=False)  # province ("...省")
df['province2'] = df['address'].str.extract(r'(.*自治区)', expand=False)  # autonomous region ("...自治区")
df['province3'] = df['address'].str.extract(r'(^.{2}市)', expand=False)  # municipality (two characters + "市")
# Merge the three candidates into one column, taking the first non-missing
# value in the order province -> autonomous region -> municipality.
# The result is assigned back instead of calling fillna(inplace=True) on
# df['province']: that in-place call works on a temporary object, which
# warns on recent pandas and has no effect under Copy-on-Write.
df['province'] = df['province1'].fillna(df['province2']).fillna(df['province3'])
df['postcode'] = df['address'].str.extract(r'(\d{6})', expand=False)  # 6-digit postcode
df[['address', 'province1', 'province2', 'province3', 'province', 'postcode']]


# Find: 0-based position of the first space in each address
# (str.find reports -1 when the substring is absent).
df = raw_df.assign(space_position=raw_df['address'].str.find(' '))
df[['address', 'space_position']]


# Numeric test: check whether every character of the string is numeric.
df = raw_df.copy()
# Computed from the `email` column so the flag matches its name and the
# columns displayed below (it was previously computed from `address`).
df['is_email_numeric'] = df['email'].str.isnumeric()
df['is_code_numeric'] = df['code'].str.isnumeric()
df[['email', 'code', 'is_email_numeric', 'is_code_numeric']]


# Zero padding: left-pad each code with '0' up to a width of 6 characters.
df = raw_df.assign(code_new=raw_df['code'].str.zfill(6))
df[['code', 'code_new']]

	name	email	address	code
0	刘倩	fangxu@example.org	黑龙江省成都市崇文马路P座 534556	8441
1	吴柳	weiyong@example.com	湖南省北镇县南溪海门路H座 236063	7293
2	陈桂芳	bdu@example.org	安徽省兴安盟市门头沟杨街x座 454777	3508
3	罗峰	gangshen@example.com	新疆维吾尔自治区博市沈河俞路p座 680391	4670
4	黄雪	zhangping@example.org	上海市萍市魏都曹路V座 674904	8536

	name	name_len
0	刘倩	2
1	吴柳	2
2	陈桂芳	3
3	罗峰	2
4	黄雪	2

	email	email_domain
0	fangxu@example.org	example.org
1	weiyong@example.com	example.com
2	bdu@example.org	example.org
3	gangshen@example.com	example.com
4	zhangping@example.org	example.org

	name	email	address	code	email_prefix	email_suffix
0	刘倩	fangxu@example.org	黑龙江省成都市崇文马路P座 534556	8441	fangxu	example.org
1	吴柳	weiyong@example.com	湖南省北镇县南溪海门路H座 236063	7293	weiyong	example.com
2	陈桂芳	bdu@example.org	安徽省兴安盟市门头沟杨街x座 454777	3508	bdu	example.org
3	罗峰	gangshen@example.com	新疆维吾尔自治区博市沈河俞路p座 680391	4670	gangshen	example.com
4	黄雪	zhangping@example.org	上海市萍市魏都曹路V座 674904	8536	zhangping	example.org

	code	code_new
0	8441	8BB1
1	7293	A293
2	3508	3C08
3	4670	B6A0
4	8536	8C36

pandas.Series.str字符串常用方法总结¶

计算字符串长度：¶

拆分字符串：¶

去除字符串首尾的空格或换行符或指定字符：¶

判断字符串是否包含指定子字符串：¶

判断字符串开头或结尾是否为指定子字符串：¶

替换字符串中符合指定规则的子字符串：¶

一对一替换字符串中的字符：¶

从字符串中提取符合指定规则的子字符串：¶

计算指定字符在字符串中的位置：¶

判断字符串是否为数值：¶

数值字符串前位补0：¶

	email	is_email_org
0	fangxu@example.org	True
1	weiyong@example.com	False
2	bdu@example.org	True
3	gangshen@example.com	False
4	zhangping@example.org	True

	address	province1	province2	province3	province	postcode
0	黑龙江省成都市崇文马路P座 534556	黑龙江省	NaN	NaN	黑龙江省	534556
1	湖南省北镇县南溪海门路H座 236063	湖南省	NaN	NaN	湖南省	236063
2	安徽省兴安盟市门头沟杨街x座 454777	安徽省	NaN	NaN	安徽省	454777
3	新疆维吾尔自治区博市沈河俞路p座 680391	NaN	新疆维吾尔自治区	NaN	新疆维吾尔自治区	680391
4	上海市萍市魏都曹路V座 674904	NaN	NaN	上海市	上海市	674904

	string1	string2	string1_len	string2_len
0	xyz	xyz	4	3
1	abc	abc	5	3
2	efg	efg	5	3
3	123	123	3	3
4			0	0

	string1	string2	string3	string4
0	xyz	xyz	xyz	xyz
1	abc	ABC	abc	abc
2	e f g	e f g	efg	efg
3	\t	\t	\t
4	\n	\n	\n