
sidebar.wechat

sidebar.feishu
sidebar.chooseYourWayToJoin

sidebar.scanToAddConsultant
Mastering Canvas's advanced tips can improve your data analysis efficiency by 10x. This article shares Canvas's advanced usage and practical experience.
Data Cleaning:
import pandas as pd
import numpy as np

# Take the upstream node's output as the working frame.
df = input_data

# Drop exact duplicate rows.
df = df.drop_duplicates()

# Fill missing values with sensible defaults.
df = df.assign(
    amount=df['amount'].fillna(0),
    category=df['category'].fillna('Uncategorized'),
)

# Filter outliers with the 1.5*IQR fence (inclusive on both ends).
q1, q3 = df['amount'].quantile([0.25, 0.75])
iqr = q3 - q1
df = df[df['amount'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]

# Hand the cleaned frame to the next node.
output_data = df
Data Transformation:
# Parse the raw date strings once, then derive calendar fields from them.
dates = pd.to_datetime(df['date'])
df['date'] = dates
df['year'] = dates.dt.year
df['month'] = dates.dt.month
df['weekday'] = dates.dt.day_name()

# Integer codes for the category labels.
df['category_code'] = pd.Categorical(df['category']).codes

# Bucket the amounts into labelled ranges.
bin_edges = [0, 100, 500, 1000, float('inf')]
bin_labels = ['Low', 'Medium', 'High', 'Very High']
df['amount_range'] = pd.cut(df['amount'], bins=bin_edges, labels=bin_labels)
output_data = df
Data Aggregation:
# Aggregate by category and region in a single pass.
agg_spec = {
    'amount': ['sum', 'mean', 'count'],
    'quantity': 'sum',
}
result = df.groupby(['category', 'region']).agg(agg_spec).reset_index()

# Flatten the resulting MultiIndex columns into friendly names.
result.columns = [
    'category', 'region',
    'total_amount', 'avg_amount', 'order_count',
    'total_quantity',
]
output_data = result
Descriptive Statistics:
import pandas as pd

# Quick summary (count/mean/std/min/quartiles/max) via describe().
stats = pd.Series.describe(df['amount'])

# Build a tidy metric/value table for display.
amount = df['amount']
metrics = [
    ('Count', amount.count()),
    ('Sum', amount.sum()),
    ('Average', amount.mean()),
    ('Median', amount.median()),
    ('Std Dev', amount.std()),
    ('Min', amount.min()),
    ('Max', amount.max()),
]
result = pd.DataFrame(metrics, columns=['metric', 'value'])
output_data = result
Correlation Analysis:
# Pairwise correlations between the numeric columns, unpivoted to long
# format so each row is one (variable1, variable2, correlation) pair.
result = (
    df[['amount', 'quantity', 'price']]
    .corr()
    .reset_index()
    .melt(id_vars='index')
    .rename(columns={'index': 'variable1',
                     'variable': 'variable2',
                     'value': 'correlation'})
)
output_data = result
Trend Analysis:
# Moving averages over the amount series.
# NOTE: rolling() without min_periods yields NaN for the first window-1 rows.
df['ma_7'] = df['amount'].rolling(window=7).mean()
df['ma_30'] = df['amount'].rolling(window=30).mean()
# Year-over-year growth (%).
# NOTE(review): periods=12 assumes one row per month — confirm the data grain.
df['yoy_growth'] = df['amount'].pct_change(periods=12) * 100
# Month-over-month growth (%): change versus the immediately preceding row.
df['mom_growth'] = df['amount'].pct_change() * 100
output_data = df
Simple Prediction:
from sklearn.linear_model import LinearRegression
import numpy as np

# Feature matrix and target vector as plain arrays.
X = df[['feature1', 'feature2']].values
y = df['target'].values

# Fit an ordinary least-squares linear model.
model = LinearRegression().fit(X, y)

# In-sample predictions plus residuals for diagnostics.
df['predicted'] = model.predict(X)
df['residual'] = df['target'] - df['predicted']
output_data = df
Clustering Analysis:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Behavioural features used for segmentation.
feature_cols = ['amount', 'frequency', 'recency']
features = df[feature_cols]

# Put features on a common scale so no single column dominates distances.
features_scaled = StandardScaler().fit_transform(features)

# Partition into 3 clusters; a fixed seed keeps labels reproducible.
kmeans = KMeans(n_clusters=3, random_state=42)
df['cluster'] = kmeans.fit_predict(features_scaled)
output_data = df
Line Chart Advanced Configuration:
{
"type": "line",
"data": {
"x": "date",
"y": ["sales", "target"],
"series": ["Actual Sales", "Target"]
},
"options": {
"title": "Sales vs Target",
"xAxis": {
"label": "Date",
"format": "YYYY-MM-DD"
},
"yAxis": {
"label": "Amount (Yuan)",
"format": ",.0f"
},
"legend": {
"position": "top"
},
"colors": ["#4ecdc4", "#ff6b6b"],
"smooth": true,
"showDataLabels": false
}
}
Combo Chart:
{
"type": "combo",
"data": {
"x": "month",
"series": [
{
"name": "Sales Amount",
"type": "bar",
"y": "amount"
},
{
"name": "Growth Rate",
"type": "line",
"y": "growth_rate",
"yAxisIndex": 1
}
]
},
"options": {
"yAxis": [
{"label": "Sales Amount (Yuan)"},
{"label": "Growth Rate (%)", "position": "right"}
]
}
}
Table Conditional Formatting:
# Add formatting info in Python Node
df['amount_color'] = df['amount'].apply(
lambda x: 'green' if x > 1000 else 'red' if x < 100 else 'black'
)
df['status_icon'] = df['status'].map({
'completed': '✅',
'pending': '⏳',
'failed': '❌'
})
output_data = df
Dynamically Select Chart Type Based on Data:
# Determine data characteristics
if len(df) < 10:
chart_type = 'bar'
elif df['category'].nunique() > 10:
chart_type = 'line'
else:
chart_type = 'pie'
# Output chart configuration
output_data = {
'data': df,
'chart_type': chart_type
}
Scenario: Merge data from different Data Nodes
Method:
# Assume two inputs
# input_data_1: Order data
# input_data_2: User data
import pandas as pd
orders = input_data_1
users = input_data_2
# Left join
result = orders.merge(users, left_on='user_id', right_on='id', how='left')
# Select needed columns
result = result[['order_id', 'user_name', 'amount', 'created_at']]
output_data = result
Resampling:
# Index by date so resample() can group on time.
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')

# Roll the daily rows up to calendar weeks and name the outputs directly.
weekly = (
    df.resample('W')
      .agg({'amount': 'sum', 'quantity': 'sum', 'order_id': 'count'})
      .rename(columns={'amount': 'weekly_amount',
                       'quantity': 'weekly_quantity',
                       'order_id': 'order_count'})
)

# Return the week-ending date as a regular column again.
output_data = weekly.reset_index()
Rolling Window:
# 7-day moving average; min_periods=1 emits a value from the first row on.
df['ma_7'] = df['amount'].rolling(window=7, min_periods=1).mean()
# Running total of amount over the whole series.
df['cumsum'] = df['amount'].cumsum()
# Year-over-year growth (%).
# NOTE(review): periods=365 assumes exactly one row per day — confirm the grain.
df['yoy'] = df['amount'].pct_change(periods=365) * 100
output_data = df
Create Pivot Table:
# Cross-tab: categories down the side, regions across the top,
# summed amounts in the cells (0 where a pair has no data).
pivot = df.pivot_table(
    index='category',
    columns='region',
    values='amount',
    aggfunc='sum',
    fill_value=0,
)

# Row totals first, then a grand-total row; the Total/Total cell ends up
# holding the grand total of the whole table.
pivot['Total'] = pivot.sum(axis=1)
pivot.loc['Total'] = pivot.sum(axis=0)
output_data = pivot.reset_index()
Limit Query Data Volume:
-- In Data Node
-- Restrict to the last 30 days and cap the row count so the canvas stays fast.
SELECT * FROM orders
WHERE created_at >= DATE_SUB(NOW(), INTERVAL 30 DAY)
LIMIT 10000
Paginated Query:
-- First page
-- NOTE: OFFSET pagination rescans all skipped rows; for deep pages prefer
-- keyset pagination (WHERE id > last_seen_id ORDER BY id LIMIT 1000).
SELECT * FROM orders
ORDER BY id
LIMIT 1000 OFFSET 0
-- Second page
SELECT * FROM orders
ORDER BY id
LIMIT 1000 OFFSET 1000
Vectorized Operations:
# Slow: Loop — iterrows() materialises a Series per row and adds elements
# one at a time in Python.
result = []
for i, row in df.iterrows():
    result.append(row['a'] + row['b'])
df['c'] = result
# Fast: Vectorized — a single C-level column addition.
df['c'] = df['a'] + df['b']
Optimize Away Slow Apply Calls:
# Slow: Row-by-row processing — apply(axis=1) calls back into Python per row.
df['result'] = df.apply(lambda row: complex_function(row), axis=1)
# Fast: Vectorized processing — operate on whole columns in one call.
df['result'] = complex_function_vectorized(df)
Data Type Optimization:
# Optimize data types to shrink the frame's memory footprint.
# NOTE(review): downcasting trades range/precision for memory — confirm all
# values fit int32/float32 before applying this to production data.
df['id'] = df['id'].astype('int32') # Reduce from int64 to int32
df['category'] = df['category'].astype('category') # String to category
df['amount'] = df['amount'].astype('float32') # Reduce from float64 to float32
output_data = df
Chunk Processing:
# Process large datasets in fixed-size chunks to bound peak memory.
chunk_size = 10000
results = []
# Stream the query result chunk by chunk instead of loading it whole.
for chunk in pd.read_sql(query, connection, chunksize=chunk_size):
    processed = process_chunk(chunk)
    results.append(processed)
# Stitch the processed chunks back into one frame.
output_data = pd.concat(results, ignore_index=True)
Create Reusable Templates:
Template Example:
[Data Source] → [Data Cleaning] → [Data Analysis] → [Visualization]
↓
[Export to Excel]
Add Node Comments:
"""
Node Description: Calculate User Retention Rate
Input: User login records
Output: 1-day, 7-day, 30-day retention rates
Calculation Logic:
1. Group by first login date
2. Calculate subsequent login status
3. Calculate retention rates
"""
# Code implementation
...
Save Important Versions:
Version Naming Convention:
v1.0 - Initial version
v1.1 - Added regional analysis
v1.2 - Optimized performance
v2.0 - Refactored data flow
Set Access Permissions:
Goal: Analyze user conversion rate from registration to purchase
Canvas Design:
Data Node 1: Query user registration data
Data Node 2: Query user browsing data
Data Node 3: Query user purchase data
↓
Python Node: Calculate conversion rates for each stage
↓
Chart Node: Funnel chart
Python Code:
import pandas as pd

# Merge data: each input node's row count is one funnel stage.
registered = len(input_data_1)
browsed = len(input_data_2)
purchased = len(input_data_3)

def _rate(count, base):
    # Conversion rate (%) against the registration base. Returns 0.0 when
    # there are no registrations to avoid a ZeroDivisionError on an empty
    # reporting period.
    return count / base * 100 if base else 0.0

# Calculate conversion rates relative to the Register stage (100% baseline).
funnel = pd.DataFrame({
    'stage': ['Register', 'Browse', 'Purchase'],
    'count': [registered, browsed, purchased],
    'rate': [
        100,
        _rate(browsed, registered),
        _rate(purchased, registered),
    ]
})
output_data = funnel
Goal: Segment customers based on RFM model
Canvas Design:
Data Node: Query order data
↓
Python Node: Calculate RFM metrics
↓
Python Node: Customer segmentation
↓
Chart Node: Segmentation distribution chart
Python Code:
import pandas as pd
from datetime import datetime

# Calculate RFM metrics per customer from the raw order rows.
now = datetime.now()
rfm = df.groupby('customer_id').agg({
    'order_date': lambda x: (now - x.max()).days, # Recency
    'order_id': 'count', # Frequency
    'amount': 'sum' # Monetary
})
rfm.columns = ['recency', 'frequency', 'monetary']
# Segmentation (simple binning): quartile scores 1-4. Recency is inverted
# (more recent = higher score); frequency and monetary score high for large values.
# NOTE(review): pd.qcut raises on duplicate bin edges — heavily skewed data
# (e.g. many single-order customers) may need special handling; confirm.
rfm['r_score'] = pd.qcut(rfm['recency'], 4, labels=[4,3,2,1])
rfm['f_score'] = pd.qcut(rfm['frequency'], 4, labels=[1,2,3,4])
rfm['m_score'] = pd.qcut(rfm['monetary'], 4, labels=[1,2,3,4])
# Calculate total score (range 3-12); categorical labels cast to int first.
rfm['rfm_score'] = rfm['r_score'].astype(int) + rfm['f_score'].astype(int) + rfm['m_score'].astype(int)
# Customer classification: map the combined RFM score onto a named segment.
def classify_customer(score):
    """Return the segment label for a total RFM score (expected range 3-12)."""
    tiers = (
        (10, 'High Value Customer'),
        (8, 'High Potential Customer'),
        (6, 'High Retention Customer'),
    )
    for threshold, label in tiers:
        if score >= threshold:
            return label
    return 'Regular Customer'
# Label every customer with their segment, then emit with customer_id restored
# as a regular column.
rfm['segment'] = rfm['rfm_score'].apply(classify_customer)
output_data = rfm.reset_index()
Goal: Detect outliers in sales data
Python Code:
import pandas as pd
import numpy as np
from scipy import stats

# Z-score method: flag points more than 3 standard deviations from the mean.
df['z_score'] = np.abs(stats.zscore(df['amount']))
df['is_outlier_z'] = df['z_score'] > 3

# IQR method: flag points outside the 1.5*IQR fences.
q1, q3 = df['amount'].quantile([0.25, 0.75])
iqr = q3 - q1
below_fence = df['amount'] < q1 - 1.5 * iqr
above_fence = df['amount'] > q3 + 1.5 * iqr
df['is_outlier_iqr'] = below_fence | above_fence

# Mark outliers: a point counts if either method flags it.
df['is_outlier'] = df['is_outlier_z'] | df['is_outlier_iqr']
output_data = df
Causes:
Solutions:
# 1. Limit data volume: cap the working set to the first 10k rows.
df = df.head(10000)
# 2. Use vectorized operations (whole-column arithmetic runs at C speed)
df['result'] = df['a'] + df['b'] # Instead of loop
# 3. Use more efficient algorithms
# Slow — apply(axis=1) invokes Python once per row
result = df.apply(lambda x: slow_function(x), axis=1)
# Fast — one vectorized call over the whole frame
result = fast_vectorized_function(df)
Solutions:
# 1. Optimize data types: smaller dtypes cut the frame's memory footprint.
df = df.astype({
    'id': 'int32',
    'category': 'category',
    'amount': 'float32'
})
# 2. Delete unnecessary columns before any heavy processing.
df = df[['id', 'amount', 'date']]
# 3. Chunk processing: stream the query instead of loading it all at once.
for chunk in pd.read_sql(query, con, chunksize=1000):
    process(chunk)
Checklist:
Canvas advanced tips core:
Python Node: ✅ Master data processing techniques ✅ Use statistical analysis methods ✅ Apply machine learning algorithms
Chart Customization: ✅ Custom chart configuration ✅ Use conditional formatting ✅ Create dynamic charts
Performance Optimization: ✅ Limit data volume ✅ Vectorized operations ✅ Optimize data types
Team Collaboration: ✅ Create reusable templates ✅ Add detailed comments ✅ Manage version history
Next Steps:
Related Reading:
Technical Exchange:
sidebar.noProgrammingNeeded
sidebar.startFreeTrial