
sidebar.wechat

sidebar.feishu
sidebar.chooseYourWayToJoin

sidebar.scanToAddConsultant
In AI data analysis scenarios, data security is paramount. Enterprise data often contains sensitive information: user phone numbers, ID card numbers, bank card numbers, addresses, etc. How can we protect this sensitive data from leakage while ensuring AI inference accuracy?
AskTable's SDI (Secure Data Inference) technology achieves high-performance, high-security data masking through field-level masking + Faker generation + Vault mapping.
This article deeply analyzes the design and implementation of this solution.
Scenario: User asks "What did the user with phone number 138xxxx1234 purchase?"
Traditional Approach:
# Directly pass sensitive data to LLM
prompt = f"Data: {dataframe.to_string()}\nQuestion: {question}"
Risks:
Complete Masking:
# Replace sensitive fields with ***
df["phone"] = "***"
Problems:
Encryption:
# Encrypt sensitive data
df["phone"] = encrypt(df["phone"])
Problems:
class IdentifiableType(StrEnum):
    """Closed set of sensitive-field categories.

    The category decides which Faker generator is used when a value of the
    field is masked (see Vault._generate_fake_value).
    """
    NONE = "none"  # Non-sensitive field (no masking applied)
    PHONE = "phone"  # Phone number
    ID_CARD = "id_card"  # National ID card number
    EMAIL = "email"  # Email address
    ADDRESS = "address"  # Postal address
    NAME = "name"  # Personal name
    BANK_CARD = "bank_card"  # Bank card number
Metadata Tagging:
# Metadata tagging: a column is marked sensitive via its identifiable_type key.
field = dict(
    name="phone",
    type="VARCHAR",
    identifiable_type="phone",  # marks this column as a sensitive phone field
)
class SecureDataFrame:
    """DataFrame wrapper that can render itself with sensitive columns masked."""

    def __init__(self, df: pd.DataFrame, vault: Vault):
        self.df = df
        self.vault = vault  # manages original-value <-> fake-value mappings

    def to_str(self, anonymize: bool = True) -> str:
        """Render the frame as text; mask sensitive columns when requested."""
        if not anonymize:
            return self.df.to_string()

        # Work on a copy so the original (unmasked) frame stays intact.
        masked = self.df.copy()
        sensitive_columns = [
            column for column in self.df.columns if self.vault.is_sensitive(column)
        ]
        for column in sensitive_columns:
            # Bind the column name as a default so each cell is masked
            # against the correct field mapping.
            masked[column] = masked[column].apply(
                lambda cell, _col=column: self.vault.anonymize(_col, cell)
            )
        return masked.to_string()
class Vault:
    """Bidirectional mapping manager: original value <-> fake value.

    Guarantees that the same original value always masks to the same fake
    value within one Vault (consistency), and that any fake value appearing
    in downstream output can be restored to the original.

    Fix vs. the previous version: ``register_field``, ``is_sensitive`` and
    ``_get_field_type`` were referenced by callers and by
    ``_generate_fake_value`` but never defined; they are implemented here
    on top of a ``_field_types`` registry.
    """

    def __init__(self):
        # field -> {original value -> fake value}
        self._forward_map: dict[str, dict[str, str]] = {}
        # field -> {fake value -> original value}
        self._reverse_map: dict[str, dict[str, str]] = {}
        # field -> identifiable type ("phone", "email", ...); populated
        # via register_field; unregistered fields default to "none".
        self._field_types: dict[str, str] = {}
        self._faker = Faker("zh_CN")

    def register_field(self, field: str, identifiable_type: str) -> None:
        """Declare *field* as sensitive with the given identifiable type."""
        self._field_types[field] = identifiable_type

    def is_sensitive(self, field: str) -> bool:
        """Return True if *field* has been registered as sensitive."""
        return field in self._field_types

    def anonymize(self, field: str, value: str) -> str:
        """Mask: original value -> fake value (stable per original value)."""
        known = self._forward_map.get(field, {})
        if value in known:
            return known[value]

        # First sighting of this value: generate and record a fake value.
        # NOTE(review): Faker may, rarely, emit the same fake value for two
        # different originals, making deanonymize ambiguous — confirm whether
        # collision handling is needed for production data volumes.
        fake_value = self._generate_fake_value(field, value)
        if field not in self._forward_map:
            self._forward_map[field] = {}
            self._reverse_map[field] = {}
        self._forward_map[field][value] = fake_value
        self._reverse_map[field][fake_value] = value
        return fake_value

    def deanonymize(self, field: str, fake_value: str) -> str:
        """Restore: fake value -> original value (identity if unknown)."""
        return self._reverse_map.get(field, {}).get(fake_value, fake_value)

    def _generate_fake_value(self, field: str, value: str) -> str:
        """Generate a type-appropriate fake value for *field*."""
        field_type = self._get_field_type(field)
        if field_type == "phone":
            return self._faker.phone_number()
        elif field_type == "email":
            return self._faker.email()
        elif field_type == "name":
            return self._faker.name()
        elif field_type == "address":
            return self._faker.address()
        elif field_type == "id_card":
            return self._faker.ssn()
        else:
            # Unknown/unregistered types fall back to full redaction.
            return "***"

    def _get_field_type(self, field: str) -> str:
        """Look up the registered identifiable type; "none" if unregistered."""
        return self._field_types.get(field, "none")
async def query_with_sdi(question: str, datasource: DataSourceAdmin) -> QueryResult:
    """Run a query with SDI: the LLM only ever sees masked data."""
    # Text-to-SQL, then fetch the raw (unmasked) result set.
    sql = await generate_sql(question, datasource)
    raw_df = await datasource.execute_sql(sql)

    # Fresh per-query vault, seeded with every sensitive field of the source.
    vault = Vault()
    for sensitive in datasource.get_sensitive_fields():
        vault.register_field(sensitive.name, sensitive.identifiable_type)

    # Only the masked rendering is handed to the LLM for explanation.
    masked_text = SecureDataFrame(raw_df, vault).to_str(anonymize=True)
    explanation = await generate_explanation(question, masked_text)

    # The caller gets the untouched data plus the explanation that was
    # produced from the masked view.
    return QueryResult(
        sql=sql,
        dataframe=raw_df,
        explanation=explanation,
    )
# Faker quick tour: locale-aware fake data generation.
from faker import Faker

faker = Faker("zh_CN")  # zh_CN locale -> Chinese-style names, addresses, phones

# Generate fake phone number
fake_phone = faker.phone_number()
# Output: 138-1234-5678  (example only; actual value is random)

# Generate fake name
fake_name = faker.name()
# Output: Zhang Wei  (example only)

# Generate fake address
fake_address = faker.address()
# Output: 88 Jianguo Road, Chaoyang District, Beijing  (example only)

# Generate fake email
fake_email = faker.email()
# Output: zhangwei@example.com  (example only)
Key Points:
Faker("zh_CN") generates Chinese-locale data.
# Example: Mask phone number
# Example: stable phone-number masking through a Vault.
vault = Vault()
vault.register_field("phone", "phone")

# Original value -> fake value
fake1 = vault.anonymize("phone", "138-1234-5678")
# Output: 156-7890-1234  (example; generated value is random)

# Same original value -> same fake value (consistency guarantee)
fake2 = vault.anonymize("phone", "138-1234-5678")
# Output: 156-7890-1234 (same as fake1)

# Fake value -> original value (reversal via the stored mapping)
original = vault.deanonymize("phone", "156-7890-1234")
# Output: 138-1234-5678
Key Points:
| Solution | Performance | Accuracy | Reversibility | Deployment Cost |
|---|---|---|---|---|
| Faker | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ |
| Local Small Model | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐ |
| Encryption | ⭐⭐⭐⭐⭐ | ⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ |
| Complete Masking | ⭐⭐⭐⭐⭐ | ⭐ | ⭐ | ⭐⭐⭐⭐⭐ |
Performance:
# Faker generation speed micro-benchmark.
import time

# perf_counter is the monotonic, high-resolution clock meant for timing;
# time.time() can jump with wall-clock adjustments and has coarser resolution.
start = time.perf_counter()
for _ in range(10000):
    faker.phone_number()
print(f"Time: {time.perf_counter() - start:.2f}s")
# Output: Time: 0.15s (10000 generations)
Accuracy:
Reversibility:
Deployment Cost:
Problems:
Conclusion: For masking scenarios, Faker is the better choice.
# End-to-end example (excerpt from an async handler: the `await` and
# `return` below assume an enclosing async function).

# Original data
df = pd.DataFrame({
    "name": ["Zhang San", "Li Si"],
    "phone": ["138-1234-5678", "139-8765-4321"],
    "amount": [1000, 2000],
})

# Create Vault and mark only the phone column as sensitive
vault = Vault()
vault.register_field("phone", "phone")

# Mask: only registered columns are replaced with fake values
secure_df = SecureDataFrame(df, vault)
anonymized_str = secure_df.to_str(anonymize=True)

# Output (after masking) — names and amounts pass through untouched
"""
name phone amount
0 Zhang San 156-7890-1234 1000
1 Li Si 157-1234-5678 2000
"""

# Pass the masked text (never the raw frame) to the LLM
explanation = await generate_explanation(question, anonymized_str)

# Return the ORIGINAL data to the user (not masked)
return df  # Contains real phone numbers
# Example with several sensitive columns at once.

# Original data
df = pd.DataFrame({
    "name": ["Zhang San", "Li Si"],
    "phone": ["138-1234-5678", "139-8765-4321"],
    "email": ["zhangsan@example.com", "lisi@example.com"],
    "address": ["Chaoyang District, Beijing", "Pudong New Area, Shanghai"],
})

# Create a vault and register every sensitive column; here each column's
# identifiable type happens to match its name.
vault = Vault()
for column in ("name", "phone", "email", "address"):
    vault.register_field(column, column)

# Mask the frame and render it as text
secure_df = SecureDataFrame(df, vault)
anonymized_str = secure_df.to_str(anonymize=True)

# Output (after masking)
"""
name phone email address
0 Wang Wei 156-7890-1234 wangwei@example.com Tianhe District, Guangzhou
1 Liu Yang 157-1234-5678 liuyang@example.com Nanshan District, Shenzhen
"""
class Vault:
    """Optimization sketch: flat-key cache in front of fake-value generation.

    NOTE(review): excerpt only — `_generate_fake_value` is assumed to come
    from the full Vault implementation; this snippet shows just the cache.
    """
    def __init__(self):
        # "field:value" composite key -> fake value
        self._cache: dict[str, str] = {}

    def anonymize(self, field: str, value: str) -> str:
        """Return the cached fake value, generating it on first sight."""
        cache_key = f"{field}:{value}"
        if cache_key in self._cache:
            return self._cache[cache_key]
        fake_value = self._generate_fake_value(field, value)
        self._cache[cache_key] = fake_value
        return fake_value
Effect:
def anonymize_batch(self, field: str, values: list[str]) -> list[str]:
    """Mask every value in *values* for *field*, preserving input order."""
    masked: list[str] = []
    for value in values:
        masked.append(self.anonymize(field, value))
    return masked
Effect:
# One Vault per session, so mappings never leak between sessions.
session_vaults: dict[str, Vault] = {}

def get_vault(session_id: str) -> Vault:
    """Return the session's vault, creating it lazily on first use."""
    try:
        return session_vaults[session_id]
    except KeyError:
        vault = session_vaults[session_id] = Vault()
        return vault
Effect:
class EncryptedVault(Vault):
    """Vault variant that can persist its mapping table encrypted at rest."""

    def __init__(self, encryption_key: str):
        super().__init__()
        # Symmetric Fernet cipher built from the caller-supplied key.
        self.cipher = Fernet(encryption_key)

    def save_to_disk(self, path: str):
        """Serialize the original->fake map and write it encrypted to *path*."""
        serialized = json.dumps(self._forward_map)
        with open(path, "wb") as handle:
            handle.write(self.cipher.encrypt(serialized.encode()))
Effect:
AskTable's SDI technology achieves through the combination of Faker + Vault + SecureDataFrame:
✅ High Security: sensitive data is never exposed to the LLM ✅ High Performance: Faker generates values quickly (< 1ms each) ✅ High Accuracy: does not degrade AI inference quality ✅ Low Cost: no GPU needed, simple deployment
Related Reading:
Technical Exchange:
sidebar.noProgrammingNeeded
sidebar.startFreeTrial