This LLM training data cleaning tutorial explains how to turn raw, messy text into training-ready corpora for large language models. It focuses on practical preprocessing steps (deduplication, noise removal, desensitization, normalization, and chunking) so the resulting dataset is safer and more usable for pre-training.
LLM training data is typically characterized by massive scale, diverse text types, and multi-domain coverage. The goal is for the model to learn language rules, world knowledge, and reasoning ability from large volumes of text.
LLM training data sources and why they matter
1) Public text collections: structured, high-quality foundational data
Examples include Wikipedia, academic paper repositories, and collections of classic books. These sources are usually cleaner and more standardized.
2) Web crawler data: broad coverage at scale
A large portion of LLM corpora comes from web crawls. This can include blogs, articles, and novels. Community signals (for example, high-rated links on Reddit) can sometimes correlate with higher quality.
3) Books and publications: in-depth knowledge
Books (especially non-fiction) can provide systematic and deep coverage of professional domains, which helps models learn complex topics.
4) Dialogue and interaction data: better conversational behavior
Some models incorporate dialogue data to better match human interaction patterns.
5) Code data: stronger programming ability
Open-source repositories (GitHub) and programming Q&A sites (Stack Overflow) help models learn “problem → code” correspondences.
Data preprocessing for LLMs: from raw to usable
Raw data cannot be used directly for training. In practice, LLM training data cleaning often includes:
- Cleaning: remove duplicates, meaningless characters (garbled text), and harmful content
- Desensitization: redact personal data (emails, phone numbers, ID numbers)
- Standardization: unify formats (case, punctuation) and filter low-quality text
This LLM training data cleaning tutorial first shows a basic pipeline, then extends it to advanced needs such as privacy redaction, multilingual noise cleanup, long-text chunking, and semantic quality filtering.
Basic LLM training data cleaning pipeline in Python
This section demonstrates a practical batch pipeline:
- remove duplicates (hash-based)
- remove overly short texts
- remove URLs and HTML tags
- remove noisy symbols and extra spaces
- lowercase English text
- filter low-quality text using a punctuation-ratio heuristic
import re
import string
import hashlib
from typing import List, Tuple
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download NLTK resources (required for first run)
nltk.download('stopwords')
nltk.download('punkt')
class LLMDataCleaner:
def __init__(self):
"""Initialize data cleaning utility and load resources like stop words"""
self.stop_words = set(stopwords.words('english'))
self.custom_stop_words = {"http", "https", "www", "com", "html", "jpg", "png"}
self.stop_words.update(self.custom_stop_words)
def remove_duplicates(self, texts: List[str]) -> Tuple[List[str], int]:
"""Remove duplicate texts using hashes"""
seen = set()
unique_texts = []
duplicates = 0
for text in texts:
text_normalized = text.strip().lower()
text_hash = hashlib.md5(text_normalized.encode()).hexdigest()
if text_hash not in seen:
seen.add(text_hash)
unique_texts.append(text)
else:
duplicates += 1
return unique_texts, duplicates
def filter_short_texts(self, texts: List[str], min_length: int = 10) -> Tuple[List[str], int]:
"""Filter out excessively short texts"""
filtered, removed = [], 0
for text in texts:
if len(text.strip()) >= min_length:
filtered.append(text)
else:
removed += 1
return filtered, removed
def clean_special_characters(self, text: str) -> str:
"""Remove URLs, HTML tags, noisy symbols, and extra spaces"""
text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
text = re.sub(r'<.*?>', '', text)
text = re.sub(r'[^\w\s.,!?\'\"-]', '', text)
text = re.sub(r'\s+', ' ', text).strip()
return text
def normalize_case(self, text: str, case: str = 'lower') -> str:
"""Normalize case (usually lowercase for English)"""
if case == 'lower':
return text.lower()
if case == 'upper':
return text.upper()
return text
def filter_low_quality_texts(self, texts: List[str], quality_threshold: float = 0.3) -> Tuple[List[str], int]:
"""Filter low-quality texts using non-punctuation ratio"""
filtered, removed = [], 0
for text in texts:
if not text:
removed += 1
continue
total_chars = len(text)
punctuation_chars = sum(1 for c in text if c in string.punctuation)
non_punct_ratio = (total_chars - punctuation_chars) / total_chars
if non_punct_ratio >= quality_threshold:
filtered.append(text)
else:
removed += 1
return filtered, removed
def detect_language(self, text: str) -> str:
"""Simple language detection (character-set based)"""
if re.search(r'[\u4e00-\u9fff]', text):
return 'zh'
if re.search(r'[a-zA-Z]', text):
return 'en'
return 'other'
def process_batch(self, texts: List[str], min_length: int = 10, quality_threshold: float = 0.3) -> Tuple[List[str], dict]:
"""Full batch processing pipeline"""
stats = {
'original_count': len(texts),
'duplicates_removed': 0,
'short_texts_removed': 0,
'low_quality_removed': 0,
'other_removed': 0,
'final_count': 0
}
unique_texts, duplicates = self.remove_duplicates(texts)
stats['duplicates_removed'] = duplicates
filtered_length, short_removed = self.filter_short_texts(unique_texts, min_length)
stats['short_texts_removed'] = short_removed
cleaned = []
for text in filtered_length:
text_clean = self.clean_special_characters(text)
if self.detect_language(text_clean) == 'en':
text_clean = self.normalize_case(text_clean, 'lower')
cleaned.append(text_clean)
high_quality, low_quality_removed = self.filter_low_quality_texts(cleaned, quality_threshold)
stats['low_quality_removed'] = low_quality_removed
stats['final_count'] = len(high_quality)
stats['other_removed'] = stats['original_count'] - stats['final_count'] - (
stats['duplicates_removed'] + stats['short_texts_removed'] + stats['low_quality_removed']
)
return high_quality, stats
if __name__ == "__main__":
raw_texts = [
"Hello world! This is a sample text for LLM training. ",
"Hello world! This is a sample text for LLM training. ",
"Bad text!!!???",
"Short.",
"https://example.com - Check this website!",
"<p>HTML tagged text</p>",
"中文文本示例,测试多语言处理。",
"Another example with multiple spaces and special chars: @#$%"
]
cleaner = LLMDataCleaner()
cleaned_texts, stats = cleaner.process_batch(raw_texts, min_length=8, quality_threshold=0.5)
print("Cleaning statistics:")
for k, v in stats.items():
print(f"{k}: {v}")
print("\nCleaned texts:")
for i, t in enumerate(cleaned_texts, 1):
print(f"{i}. {t}")
Advanced LLM training data cleaning: privacy, multilingual noise, and chunking
For real-world pre-training corpora, you often need additional steps:
Desensitization for sensitive data
Redact phone numbers, emails, ID numbers, and other personal identifiers. If private details remain in the training data, the model may memorize them and leak them later.
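As a minimal illustration (simplified patterns, not production-grade validators; the fuller AdvancedLLMCleaner class later in this tutorial adds NER-based redaction on top), regex placeholders are often enough for emails and mainland-China mobile numbers:

import re

def redact_basic_pii(text: str) -> str:
    """Replace obvious emails and Chinese mobile numbers with placeholder tokens (simplified patterns)."""
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '[EMAIL_REDACTED]', text)
    text = re.sub(r'\b1[3-9]\d{9}\b', '[PHONE_REDACTED]', text)
    return text

print(redact_basic_pii("Contact [email protected] or 13800138000."))
# -> Contact [EMAIL_REDACTED] or [PHONE_REDACTED].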
Multilingual noise cleaning
Mixed-language content is common. For example, Chinese sentences may include short English fragments. Keep meaningful terms and remove low-value noise.
Split long text into semantic chunks
Long documents can exceed model input limits. The split_long_text helper shown below splits on sentence boundaries to keep each chunk within a token limit while preserving semantic coherence.
Advanced pipeline example: desensitization + multilingual cleanup + semantic chunking
import re
import spacy
import fasttext
from typing import List, Tuple
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from langdetect import detect, LangDetectException
# Load spaCy models for NER-based redaction and sentence splitting.
# They require a one-time download, e.g. `python -m spacy download en_core_web_lg`.
try:
    nlp_en = spacy.load("en_core_web_lg")
    nlp_zh = spacy.load("zh_core_web_lg")
except OSError:
    print("Warning: spaCy models not found. Sensitive info detection may be limited.")
    nlp_en = None
    nlp_zh = None
try:
    # lid.176.bin is fastText's pretrained language-identification model (download from fasttext.cc)
    ft_model = fasttext.load_model('lid.176.bin')
except (ValueError, OSError):
    print("Warning: fastText model not found. Using fallback language detection.")
    ft_model = None
# NOTE: this checkpoint has no fine-tuned quality head; loading it with num_labels=2
# attaches a randomly initialized classification head, so its LABEL_0/LABEL_1 outputs are
# meaningless until the model is fine-tuned on labeled quality data (or replaced with a
# classifier trained for that purpose).
quality_model_name = "microsoft/xtremedistil-l6-h384-uncased"
quality_tokenizer = AutoTokenizer.from_pretrained(quality_model_name)
quality_model = AutoModelForSequenceClassification.from_pretrained(quality_model_name, num_labels=2)
quality_pipeline = pipeline(
"text-classification",
model=quality_model,
tokenizer=quality_tokenizer,
device=0 if torch.cuda.is_available() else -1
)
class AdvancedLLMCleaner:
def __init__(self):
self.sensitive_patterns = {
"email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
"phone": r'\b(?:\+?86)?1[3-9]\d{9}\b',
"id_card": r'\b\d{17}[\dXx]\b',
"credit_card": r'\b(?:\d{4}[-\s]?){3}\d{4}\b'
}
self.harmful_keywords = {"violence", "discrimination", "hate", "terrorism"}
def detect_language_advanced(self, text: str) -> Tuple[str, float]:
if not text.strip():
return ("unknown", 0.0)
try:
if ft_model:
                # fastText's predict() handles one line at a time, so strip newlines first
                preds = ft_model.predict(text.replace("\n", " "), k=1)
lang = preds[0][0].replace("__label__", "")
conf = preds[1][0]
return (lang, conf)
lang = detect(text)
return (lang, 0.8)
        except (LangDetectException, ValueError, IndexError):
return ("unknown", 0.0)
def split_long_text(self, text: str, lang: str = "en", max_tokens: int = 512) -> List[str]:
nlp = nlp_en if lang == "en" else nlp_zh if lang == "zh" else None
if not nlp or not text.strip():
return [text]
doc = nlp(text)
sentences = [sent.text for sent in doc.sents]
chunks, current, current_len = [], [], 0
for sent in sentences:
sent_tokens = len(nlp(sent))
if current_len + sent_tokens <= max_tokens:
current.append(sent)
current_len += sent_tokens
else:
if current:
chunks.append(" ".join(current))
current = [sent]
current_len = sent_tokens
if current:
chunks.append(" ".join(current))
return chunks
def filter_semantic_quality(self, texts: List[str], threshold: float = 0.7) -> List[str]:
high_quality = []
for i in range(0, len(texts), 32):
batch = texts[i:i+32]
            # Truncate long inputs to the model's maximum length to avoid tokenizer errors
            results = quality_pipeline(batch, truncation=True)
for text, res in zip(batch, results):
if res["label"] == "LABEL_1" and res["score"] >= threshold:
high_quality.append(text)
return high_quality
def desensitize_text(self, text: str, lang: str = "en") -> str:
for name, pattern in self.sensitive_patterns.items():
text = re.sub(pattern, f"[{name}_REDACTED]", text)
nlp = nlp_en if lang == "en" else nlp_zh if lang == "zh" else None
if nlp:
doc = nlp(text)
for ent in doc.ents:
if ent.label_ in ["PERSON", "GPE", "ORG", "DATE"]:
text = text.replace(ent.text, f"[{ent.label_}_REDACTED]")
for keyword in self.harmful_keywords:
if keyword in text.lower():
return ""
return text
def remove_cross_lang_noise(self, text: str, primary_lang: str = None) -> str:
if not primary_lang:
primary_lang, _ = self.detect_language_advanced(text)
if primary_lang == "unknown":
return text
if primary_lang == "zh":
english_words = re.findall(r'[A-Za-z]+', text)
for word in english_words:
if len(word) < 3 and word.lower() not in {"ai", "it", "gdp"}:
text = text.replace(word, "")
return text
if __name__ == "__main__":
cleaner = AdvancedLLMCleaner()
long_text = """
Dr. John Smith ([email protected]) delivered a speech on AI in Beijing.
He mentioned that 80% of data scientists use Python. His phone number is 13800138000.
这是一段包含英文单词的中文文本,其中夹杂着一些 short 英文单词。
"""
lang, _ = cleaner.detect_language_advanced(long_text)
desensitized = cleaner.desensitize_text(long_text, lang)
filtered = cleaner.remove_cross_lang_noise(desensitized, lang)
chunks = cleaner.split_long_text(filtered, lang)
print("Processed chunks:")
for i, chunk in enumerate(chunks, 1):
print(f"Chunk {i}: {chunk}")