import pandas as pd
# Load the CSV file, print its structural summary, then print a preview.
file_path = 'data.csv'
df = pd.read_csv(file_path)

print('数据基本信息:')
df.info()  # info() writes its report to stdout on its own

rows, columns = df.shape
has_rows = rows > 0
# A non-empty frame shows only its head; an empty frame is dumped whole.
print('数据前几行信息:' if has_rows else '数据全部内容信息:')
preview = df.head() if has_rows else df
print(preview.to_csv(sep='\t', na_rep='nan'))
import pandas as pd
# Load the Excel workbook, print its structural summary, then a preview.
file_path = 'data.xlsx'
df = pd.read_excel(file_path)

print('数据基本信息:')
df.info()  # info() writes its report to stdout on its own

rows, columns = df.shape
has_rows = rows > 0
# A non-empty frame shows only its head; an empty frame is dumped whole.
print('数据前几行信息:' if has_rows else '数据全部内容信息:')
preview = df.head() if has_rows else df
print(preview.to_csv(sep='\t', na_rep='nan'))
import pandas as pd
# Report the per-column count of missing values, then drop every row that
# contains at least one missing value and summarize what is left.
file_path = 'data.csv'
df = pd.read_csv(file_path)

missing_values = df.isna().sum()
print("缺失值统计:")
print(missing_values)

df = df.dropna()

print('清洗后数据基本信息:')
df.info()

rows, columns = df.shape
has_rows = rows > 0
# A non-empty result shows only its head; an empty result is dumped whole.
print('清洗后数据前几行信息:' if has_rows else '清洗后数据全部内容信息:')
preview = df.head() if has_rows else df
print(preview.to_csv(sep='\t', na_rep='nan'))
import pandas as pd
# Count fully duplicated rows, remove them, and summarize the result.
file_path = 'data.csv'
df = pd.read_csv(file_path)

duplicate_flags = df.duplicated()
duplicate_rows = duplicate_flags.sum()
print("重复值数量:", duplicate_rows)

df = df.drop_duplicates()

print('清洗后数据基本信息:')
df.info()

rows, columns = df.shape
has_rows = rows > 0
# A non-empty result shows only its head; an empty result is dumped whole.
print('清洗后数据前几行信息:' if has_rows else '清洗后数据全部内容信息:')
preview = df.head() if has_rows else df
print(preview.to_csv(sep='\t', na_rep='nan'))
import pandas as pd
import numpy as np
# Remove outliers from one column using the 1.5 * IQR rule, then summarize.
file_path = 'data.csv'
df = pd.read_csv(file_path)

column = 'column_name'
target = df[column]

Q1 = target.quantile(0.25)
Q3 = target.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# between() is inclusive on both ends by default; rows where the column is
# NaN evaluate to False and are therefore dropped as well.
within_bounds = target.between(lower_bound, upper_bound)
df = df[within_bounds]

print('清洗后数据基本信息:')
df.info()

rows, columns = df.shape
has_rows = rows > 0
# A non-empty result shows only its head; an empty result is dumped whole.
print('清洗后数据前几行信息:' if has_rows else '清洗后数据全部内容信息:')
preview = df.head() if has_rows else df
print(preview.to_csv(sep='\t', na_rep='nan'))
import pandas as pd
# Print descriptive statistics for the numeric columns and a frequency
# table for every object-dtype column.
file_path = 'data.csv'
df = pd.read_csv(file_path)

numeric_part = df.select_dtypes(include=['number'])
numeric_stats = numeric_part.describe()
print("数值型数据描述性统计:")
print(numeric_stats)

object_part = df.select_dtypes(include=['object'])
categorical_cols = object_part.columns
for col in categorical_cols:
    # Count first, print second, so a failure never leaves a dangling header.
    freq_dist = df[col].value_counts()
    print(f"\n{col} 频数分布:")
    print(freq_dist)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Compute the pairwise correlation matrix of the numeric columns, print it,
# and render it as an annotated heatmap.
file_path = 'data.csv'
df = pd.read_csv(file_path)

# Restrict to numeric/bool columns before correlating. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# ValueError when it meets, e.g., a string column. Selecting the columns
# explicitly works the same way on old and new pandas versions.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_df.corr()
print("相关性矩阵:")
print(correlation_matrix)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()
import pandas as pd
# Group the rows by one column and print the mean of every numeric column
# within each group.
file_path = 'data.csv'
df = pd.read_csv(file_path)

grouped = df.groupby('column_name')
# numeric_only=True keeps the aggregation to numeric columns. Since
# pandas 2.0 the default is False, and mean() raises TypeError as soon as
# the frame contains a non-numeric (e.g. string) column.
mean_values = grouped.mean(numeric_only=True)
print("分组后的均值:")
print(mean_values)
import pandas as pd
import matplotlib.pyplot as plt
# Draw a line plot of one column against another.
file_path = 'data.csv'
df = pd.read_csv(file_path)

x, y = df['column_x'], df['column_y']

plt.plot(x, y)
# Decorate the current axes: axis labels first, then the title.
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Line Plot')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Draw a bar chart of how often each distinct value occurs in one column.
file_path = 'data.csv'
df = pd.read_csv(file_path)

column = 'column_name'
values = df[column].value_counts()
categories = values.index

plt.bar(categories, values)
# Decorate the current axes: axis labels first, then the title.
plt.xlabel('Categories')
plt.ylabel('Count')
plt.title('Bar Plot')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Draw a scatter plot of one column against another.
file_path = 'data.csv'
df = pd.read_csv(file_path)

x, y = df['column_x'], df['column_y']

plt.scatter(x, y)
# Decorate the current axes: axis labels first, then the title.
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Scatter Plot')
plt.show()
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Draw a horizontal box plot summarizing the distribution of one column.
file_path = 'data.csv'
df = pd.read_csv(file_path)

column = 'column_name'
series = df[column]

sns.boxplot(x=series)
plt.title('Box Plot')
plt.show()