
sidebar.wechat

sidebar.feishu
sidebar.chooseYourWayToJoin

sidebar.scanToAddConsultant
掌握 Canvas 的高级技巧,可以让你的数据分析效率提升 10 倍。本文将分享 Canvas 的高级用法和实战经验。
数据清洗:
import pandas as pd
import numpy as np

# Data handed in by the upstream node
df = input_data

# Remove duplicated rows
df = df.drop_duplicates()

# Impute missing values: amounts default to 0, categories to a placeholder
df['amount'] = df['amount'].fillna(0)
df['category'] = df['category'].fillna('未分类')

# Filter outliers with the 1.5 * IQR rule
q1 = df['amount'].quantile(0.25)
q3 = df['amount'].quantile(0.75)
iqr = q3 - q1
df = df[df['amount'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]

# Expose the cleaned frame to the next node
output_data = df
数据转换:
# Derive calendar fields from the raw date column
df['date'] = pd.to_datetime(df['date'])
parts = df['date'].dt
df['year'] = parts.year
df['month'] = parts.month
df['weekday'] = parts.day_name()

# Integer codes for the category labels
df['category_code'] = pd.Categorical(df['category']).codes

# Bucket amounts into labelled ranges
edges = [0, 100, 500, 1000, float('inf')]
df['amount_range'] = pd.cut(df['amount'], bins=edges,
                            labels=['低', '中', '高', '极高'])
output_data = df
数据聚合:
# Aggregate amount and quantity over every category x region pair
spec = {'amount': ['sum', 'mean', 'count'], 'quantity': 'sum'}
result = df.groupby(['category', 'region']).agg(spec).reset_index()

# Flatten the resulting MultiIndex columns into plain names
result.columns = ['category', 'region', 'total_amount', 'avg_amount',
                  'order_count', 'total_quantity']
output_data = result
描述性统计:
import pandas as pd

# Quick look at the amount distribution
stats = df['amount'].describe()

# Build a tidy metric/value summary table
amount = df['amount']
metrics = ['总数', '总和', '平均值', '中位数', '标准差', '最小值', '最大值']
values = [
    amount.count(),
    amount.sum(),
    amount.mean(),
    amount.median(),
    amount.std(),
    amount.min(),
    amount.max(),
]
result = pd.DataFrame({'metric': metrics, 'value': values})
output_data = result
相关性分析:
# Pairwise correlation matrix for the numeric columns
corr = df[['amount', 'quantity', 'price']].corr()

# Reshape the square matrix into one row per variable pair
result = corr.reset_index().melt(id_vars='index')
result.columns = ['variable1', 'variable2', 'correlation']
output_data = result
趋势分析:
# Smooth the series with 7- and 30-period moving averages
for span in (7, 30):
    df[f'ma_{span}'] = df['amount'].rolling(window=span).mean()

# Year-over-year growth in % (12 periods back — assumes monthly rows; confirm)
df['yoy_growth'] = df['amount'].pct_change(periods=12) * 100
# Period-over-period growth in %
df['mom_growth'] = df['amount'].pct_change() * 100
output_data = df
简单预测:
from sklearn.linear_model import LinearRegression
import numpy as np

# Feature matrix and target vector
features = df[['feature1', 'feature2']].values
target = df['target'].values

# Fit an ordinary least-squares model on all rows
model = LinearRegression().fit(features, target)

# In-sample predictions and their residuals
df['predicted'] = model.predict(features)
df['residual'] = df['target'] - df['predicted']
output_data = df
聚类分析:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Behavioural features used for segmentation
features = df[['amount', 'frequency', 'recency']]

# Put every feature on a comparable scale before clustering
features_scaled = StandardScaler().fit_transform(features)

# Assign each row to one of three clusters (fixed seed for reproducibility)
df['cluster'] = KMeans(n_clusters=3, random_state=42).fit_predict(features_scaled)
output_data = df
折线图高级配置:
{
"type": "line",
"data": {
"x": "date",
"y": ["sales", "target"],
"series": ["实际销售", "目标"]
},
"options": {
"title": "销售额 vs 目标",
"xAxis": {
"label": "日期",
"format": "YYYY-MM-DD"
},
"yAxis": {
"label": "金额(元)",
"format": ",.0f"
},
"legend": {
"position": "top"
},
"colors": ["#4ecdc4", "#ff6b6b"],
"smooth": true,
"showDataLabels": false
}
}
组合图表:
{
"type": "combo",
"data": {
"x": "month",
"series": [
{
"name": "销售额",
"type": "bar",
"y": "amount"
},
{
"name": "增长率",
"type": "line",
"y": "growth_rate",
"yAxisIndex": 1
}
]
},
"options": {
"yAxis": [
{"label": "销售额(元)"},
{"label": "增长率(%)", "position": "right"}
]
}
}
表格条件格式:
# Attach presentation hints to each row in the Python Node
def _amount_color(value):
    # Highlight large amounts green, small ones red, the rest black
    if value > 1000:
        return 'green'
    if value < 100:
        return 'red'
    return 'black'

df['amount_color'] = df['amount'].apply(_amount_color)

# Map order status to a display icon
df['status_icon'] = df['status'].map({
    'completed': '✅',
    'pending': '⏳',
    'failed': '❌',
})
output_data = df
根据数据动态选择图表类型:
# Choose a chart type from the shape of the data
small_sample = len(df) < 10
many_categories = df['category'].nunique() > 10

if small_sample:
    chart_type = 'bar'
elif many_categories:
    chart_type = 'line'
else:
    chart_type = 'pie'

# Emit the data together with the chosen chart configuration
output_data = {
    'data': df,
    'chart_type': chart_type,
}
场景:合并来自不同 Data Node 的数据
方法:
# Two upstream inputs:
#   input_data_1 - order records
#   input_data_2 - user records
import pandas as pd

orders = input_data_1
users = input_data_2

# Keep every order and attach the matching user row (if any)
merged = orders.merge(users, left_on='user_id', right_on='id', how='left')

# Project down to the reporting columns
result = merged[['order_id', 'user_name', 'amount', 'created_at']]
output_data = result
重采样:
# Index by date so resample() can group on it
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')

# Roll daily rows up to weekly sums and an order count
weekly = df.resample('W').agg({
    'amount': 'sum',
    'quantity': 'sum',
    'order_id': 'count',
})
weekly.columns = ['weekly_amount', 'weekly_quantity', 'order_count']
output_data = weekly.reset_index()
滚动窗口:
amount = df['amount']
# 7-day moving average; partial windows allowed at the start of the series
df['ma_7'] = amount.rolling(window=7, min_periods=1).mean()
# Running total
df['cumsum'] = amount.cumsum()
# Year-over-year change in % (365 periods back — assumes daily rows; confirm)
df['yoy'] = amount.pct_change(periods=365) * 100
output_data = df
创建透视表:
# Cross-tabulate amount: categories as rows, regions as columns
pivot = df.pivot_table(values='amount',
                       index='category',
                       columns='region',
                       aggfunc='sum',
                       fill_value=0)

# Append row totals, then a column-total row (which includes the new column)
pivot['总计'] = pivot.sum(axis=1)
pivot.loc['总计'] = pivot.sum(axis=0)
output_data = pivot.reset_index()
限制查询数据量:
-- In the Data Node: only pull the last 30 days, capped at 10k rows
SELECT * FROM orders
WHERE created_at >= DATE_SUB(NOW(), INTERVAL 30 DAY)
LIMIT 10000
分页查询:
-- Page 1 (stable ORDER BY is required for consistent paging)
SELECT * FROM orders
ORDER BY id
LIMIT 1000 OFFSET 0
-- Page 2
SELECT * FROM orders
ORDER BY id
LIMIT 1000 OFFSET 1000
向量化操作:
# Slow: Python-level loop, one iteration per row
result = []
for i, row in df.iterrows():
result.append(row['a'] + row['b'])
df['c'] = result
# Fast: a single vectorized operation over whole columns
df['c'] = df['a'] + df['b']
使用 apply 优化:
# Slow: the function is invoked once per row via apply
df['result'] = df.apply(lambda row: complex_function(row), axis=1)
# Fast: a vectorized variant that operates on whole columns at once
df['result'] = complex_function_vectorized(df)
数据类型优化:
# Shrink memory usage with narrower / categorical dtypes
df = df.astype({
    'id': 'int32',           # down from int64
    'category': 'category',  # strings -> categorical codes
    'amount': 'float32',     # down from float64
})
output_data = df
分块处理:
# Stream a large result set in fixed-size chunks instead of one big read
chunk_size = 10000
results = []
for chunk in pd.read_sql(query, connection, chunksize=chunk_size):
processed = process_chunk(chunk)
results.append(processed)
output_data = pd.concat(results, ignore_index=True)
创建可复用模板:
模板示例:
[数据源] → [数据清洗] → [数据分析] → [可视化]
↓
[导出 Excel]
添加节点注释:
"""
节点说明:计算用户留存率
输入:用户登录记录
输出:1日、7日、30日留存率
计算逻辑:
1. 按首次登录日期分组
2. 计算后续登录情况
3. 计算留存率
"""
# 代码实现
...
保存关键版本:
版本命名规范:
v1.0 - 初始版本
v1.1 - 添加地区分析
v1.2 - 优化性能
v2.0 - 重构数据流程
设置访问权限:
目标:分析用户从注册到购买的转化率
Canvas 设计:
Data Node 1: 查询用户注册数据
Data Node 2: 查询用户浏览数据
Data Node 3: 查询用户购买数据
↓
Python Node: 计算各阶段转化率
↓
Chart Node: 漏斗图
Python 代码:
import pandas as pd

# Funnel stage sizes: each input is the row set for one stage
registered = len(input_data_1)
browsed = len(input_data_2)
purchased = len(input_data_3)

def _rate(count, base):
    """Conversion rate of *count* against *base* in %; 0.0 when base is empty.

    Guards against ZeroDivisionError when there are no registered users.
    """
    return count / base * 100 if base else 0.0

# All rates are measured against the first stage (registration)
funnel = pd.DataFrame({
    'stage': ['注册', '浏览', '购买'],
    'count': [registered, browsed, purchased],
    'rate': [
        _rate(registered, registered),  # 100 whenever anyone registered
        _rate(browsed, registered),
        _rate(purchased, registered),
    ],
})
output_data = funnel
目标:根据 RFM 模型对客户分群
Canvas 设计:
Data Node: 查询订单数据
↓
Python Node: 计算 RFM 指标
↓
Python Node: 客户分群
↓
Chart Node: 分群分布图
Python 代码:
import pandas as pd
from datetime import datetime

# Per-customer RFM metrics: days since last order, order count, total spend
now = datetime.now()
rfm = df.groupby('customer_id').agg({
    'order_date': lambda x: (now - x.max()).days,  # Recency
    'order_id': 'count',                           # Frequency
    'amount': 'sum'                                # Monetary
})
rfm.columns = ['recency', 'frequency', 'monetary']

# Quartile scores. Rank with method='first' before qcut so heavily tied
# values (frequency in particular) cannot raise
# "ValueError: Bin edges must be unique".
# Recency is scored inverted: more recent (smaller) => higher score.
rfm['r_score'] = pd.qcut(rfm['recency'].rank(method='first'), 4, labels=[4, 3, 2, 1])
rfm['f_score'] = pd.qcut(rfm['frequency'].rank(method='first'), 4, labels=[1, 2, 3, 4])
rfm['m_score'] = pd.qcut(rfm['monetary'].rank(method='first'), 4, labels=[1, 2, 3, 4])

# Total score ranges from 3 (worst) to 12 (best)
rfm['rfm_score'] = (rfm['r_score'].astype(int)
                    + rfm['f_score'].astype(int)
                    + rfm['m_score'].astype(int))

def classify_customer(score):
    """Map a total RFM score (3-12) to a named customer segment."""
    if score >= 10:
        return '重要价值客户'
    elif score >= 8:
        return '重要发展客户'
    elif score >= 6:
        return '重要保持客户'
    else:
        return '一般客户'

rfm['segment'] = rfm['rfm_score'].apply(classify_customer)
output_data = rfm.reset_index()
目标:检测销售数据中的异常值
Python 代码:
import pandas as pd
import numpy as np
from scipy import stats

# Z-score method: flag points more than 3 standard deviations from the mean.
# nan_policy='omit' prevents a single NaN amount from turning every z-score
# into NaN (which would silently disable this method entirely).
df['z_score'] = np.abs(stats.zscore(df['amount'], nan_policy='omit'))
df['is_outlier_z'] = df['z_score'] > 3

# IQR method: flag points outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
q1 = df['amount'].quantile(0.25)
q3 = df['amount'].quantile(0.75)
iqr = q3 - q1
df['is_outlier_iqr'] = (df['amount'] < q1 - 1.5 * iqr) | (df['amount'] > q3 + 1.5 * iqr)

# A row is an outlier when either method flags it
df['is_outlier'] = df['is_outlier_z'] | df['is_outlier_iqr']
output_data = df
原因:
解决方案:
# 1. Cap the number of rows processed
df = df.head(10000)
# 2. Prefer vectorized column operations
df['result'] = df['a'] + df['b'] # instead of a row-by-row loop
# 3. Switch to a more efficient algorithm
# Slow
result = df.apply(lambda x: slow_function(x), axis=1)
# Fast
result = fast_vectorized_function(df)
解决方案:
# 1. Use narrower / categorical dtypes to cut memory
df = df.astype({
'id': 'int32',
'category': 'category',
'amount': 'float32'
})
# 2. Drop columns that are not needed downstream
df = df[['id', 'amount', 'date']]
# 3. Process the data in fixed-size chunks
for chunk in pd.read_sql(query, con, chunksize=1000):
process(chunk)
检查清单:
Canvas 高级技巧的核心:
Python Node: ✅ 掌握数据处理技巧 ✅ 使用统计分析方法 ✅ 应用机器学习算法
图表定制: ✅ 自定义图表配置 ✅ 使用条件格式 ✅ 创建动态图表
性能优化: ✅ 限制数据量 ✅ 向量化操作 ✅ 优化数据类型
团队协作: ✅ 创建可复用模板 ✅ 添加详细注释 ✅ 管理版本历史
下一步:
相关阅读:
技术交流:
sidebar.noProgrammingNeeded
sidebar.startFreeTrial