概述
- df.drop_duplicates()
DataFrame.drop_duplicates(self, subset=None, keep='first', inplace=False)
参数:
subset : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by default use all of the columns
keep : {'first', 'last', False}, default 'first'
first : Drop duplicates except for the first occurrence.
last : Drop duplicates except for the last occurrence.
False : Drop all duplicates.
inplace : boolean, default False
Whether to drop duplicates in place or to return a copy
返回: DataFrame
- df.duplicated()参数
DataFrame.duplicated(self, subset=None, keep='first')[source]
参数:
subset : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by default use all of the columns
keep : {'first', 'last', False}, default 'first'
first : Mark duplicates as True except for the first occurrence.
last : Mark duplicates as True except for the last occurrence.
False : Mark all duplicates as True.
返回值: Series
import pandas as pd
import numpy as np
import random

# Compact numeric display for numpy arrays (3 decimals, no scientific notation).
np.set_printoptions(precision=3, suppress=True)
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
# Use the fully-qualified option key: the bare 'precision' alias is ambiguous
# (and rejected) in pandas >= 1.4.
pd.set_option('display.precision', 5)  # display precision
# Show plain integers instead of scientific notation for readability.
pd.set_option('display.float_format', lambda x: '%.0f' % x)
'''
drop_repeat_infor:删除完全相同的字段repeat_infor的数据条目,区分大小写
df_data_origin为原始数据(dateframe),repeat_infor是目标字段名称(str)
'''
def drop_repeat_infor(df_data_origin, repeat_infor):
    """Remove rows whose value in column *repeat_infor* is an exact,
    case-sensitive duplicate (the first occurrence of each value is kept).

    Parameters
    ----------
    df_data_origin : DataFrame
        Original data.
    repeat_infor : str
        Name of the column to de-duplicate on.

    Returns
    -------
    DataFrame
        A copy of the input without duplicate *repeat_infor* values.
    """
    return df_data_origin.drop_duplicates(subset=[repeat_infor])
'''
get_case_insensentive_repeat_infor:不区分字段repeat_infor大小写,找出重复字段repeat_infor的数据
保留所有重复值,duplicated(subset=[repeat_infor],keep = False)
保留一条重复值,duplicated(subset=[repeat_infor],keep = 'first')
'''
def get_case_insensentive_repeat_infor(df_data, repeat_infor, ref_infor):
    """Return rows whose *repeat_infor* value collides case-insensitively.

    Exact (case-sensitive) duplicates are removed first; the remaining values
    are lower-cased and every row participating in a case-insensitive
    collision is returned (``keep=False`` keeps all members of each group).

    Parameters
    ----------
    df_data : DataFrame
        Original data.
    repeat_infor : str
        Column to check for case-insensitive duplicates.
    ref_infor : str
        Unused; kept for backward compatibility with existing callers.

    Returns
    -------
    DataFrame
        Rows of the de-duplicated frame whose lower-cased *repeat_infor*
        value occurs more than once.
    """
    # Drop exact duplicates first so only case-variant collisions remain.
    df_unique = df_data.drop_duplicates(subset=[repeat_infor])
    # A lower-cased Series is enough here; the original built a throwaway
    # one-column DataFrame and left an unused `ref_infor_lower` variable.
    lowered = df_unique[repeat_infor].astype(str).str.lower()
    # keep=False marks every row of each collision group as duplicated.
    return df_unique[lowered.duplicated(keep=False)]
'''
def get_similar_infor_data:找出参考字段(ref_infor)相同,关心字段(repeat_infor)不同的数据条目
df_data数据(dateframe)
'''
def get_similar_infor_data(df_data, repeat_infor, ref_infor):
    """Find *ref_infor* values that map to more than one distinct row after
    de-duplicating on *repeat_infor* (e.g. one project id with several
    different project names).

    Parameters
    ----------
    df_data : DataFrame
        Original data.
    repeat_infor : str
        Column de-duplicated before counting.
    ref_infor : str
        Column to group and count by.

    Returns
    -------
    DataFrame
        Columns ``[ref_infor, 'counts']``, restricted to groups with
        ``counts > 1``.
    """
    deduped = df_data.drop_duplicates([repeat_infor])
    # Count entries per reference value, then keep only the multi-entry groups.
    counts = (
        deduped.groupby([ref_infor])
        .size()
        .reset_index(name='counts')
    )
    return counts[counts['counts'] > 1]
# ---- Script: report non-standard project_name values in the TFS export ----

# Load the raw export (GBK-encoded CSV with project_code / project_name).
TFS_data_origin = pd.read_csv("data_input/TFS_需求清单项目名称.csv", encoding='gbk')

# 1) Drop rows whose project_name is an exact duplicate.
TFS_drop_repeat_infor_temp = drop_repeat_infor(TFS_data_origin, 'project_name')

# 2) Sort by project_name; NaN values sort first (na_position='last' puts them last).
TFS_drop_repeat_infor = TFS_drop_repeat_infor_temp.sort_values(
    'project_name', ascending=True, na_position='first')

# 3) Among the de-duplicated names, report names that collide case-insensitively.
TFS_drop_repeat_infor_lower = get_case_insensentive_repeat_infor(
    TFS_data_origin, 'project_name', 'project_code')
print(TFS_drop_repeat_infor_lower)

# 4) Find project codes that map to more than one distinct project name.
TFS_get_similar_infor_data = get_similar_infor_data(
    TFS_data_origin, 'project_name', 'project_code')

# 5) List the offending rows. Use exact set membership on the codes: the
#    original joined the codes into one comma-separated string and used a
#    substring test, so e.g. code "12" would wrongly match "123".
similar_ids = set(TFS_get_similar_infor_data['project_code'].astype(str))
TFS_similar_name = TFS_drop_repeat_infor[
    TFS_drop_repeat_infor['project_code'].astype(str).isin(similar_ids)]
print(TFS_similar_name)
参考文档
参考文档
参考文档
最后
以上就是贪玩秋天为你收集整理的python 利用df.drop_duplicates()和df.duplicated()实现查找某字段取值不标准的数据的全部内容,希望文章能够帮你解决python 利用df.drop_duplicates()和df.duplicated()实现查找某字段取值不标准的数据所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
发表评论 取消回复