Permanently modify model weights to bake in steering behaviors. Unlike runtime activation steering (hooks), weight modifications are permanent and the exported model no longer needs steering vectors at inference time.
Wisent provides three main approaches to permanent weight modification:
Decomposes weights into magnitude and direction, projects only the direction while preserving original magnitudes. Maintains model intelligence and reasoning capabilities. Can add or remove behaviors.
Adds a bias toward the steering direction directly into the weights. A more conservative approach that better preserves capabilities; equivalent to pre-computing the steering.
Removes the weight component parallel to the steering vector with a plain projection, W' = W − v̂ v̂ᵀ W. Not recommended: this removal alters weight norms and can reduce model performance (see the sketch below for the contrast with the norm-preserving variant).
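For intuition, here is a minimal sketch of the two projections (illustrative math only, not Wisent's internal implementation; the per-row treatment and tensor shapes are assumptions):

import torch

def project_out(W: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    # Plain projection: remove each row's component along v (W' = W - (W v̂) v̂ᵀ).
    # This shrinks row norms, which is why it is not recommended.
    v_hat = v / v.norm()
    return W - (W @ v_hat).unsqueeze(-1) * v_hat

def project_out_norm_preserved(W: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    # Norm-preserving variant: project the direction, then rescale each row
    # back to its original magnitude.
    original_norms = W.norm(dim=-1, keepdim=True)
    projected = project_out(W, v)
    return projected / projected.norm(dim=-1, keepdim=True) * original_norms

W = torch.randn(4, 8)   # toy weight matrix
v = torch.randn(8)      # toy steering direction
print(W.norm(dim=-1))                                  # original row norms
print(project_out(W, v).norm(dim=-1))                  # shrunk by plain projection
print(project_out_norm_preserved(W, v).norm(dim=-1))   # restored to original norms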
from wisent.core.weight_modification import (
# Norm-Preserving Directional Modification (RECOMMENDED)
project_weights,
project_weights_norm_preserved,
project_weights_multi_direction,
project_weights_titan,
# Additive (bake steering into weights)
bake_steering_into_weights,
# Export
export_modified_model,
save_modified_weights,
compare_models,
# Multi-direction methods
train_and_bake_titan,
train_and_bake_prism,
train_and_bake_pulse,
# Guided modification
run_guided_modification,
# Multi-concept
run_multi_concept_modification,
)

This method works best when you need to eliminate a problematic behavior, such as the model refusing too often, while keeping its core capabilities intact.
from wisent.core.weight_modification import (
project_weights,
export_modified_model,
)
# Load your model
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3-8B-Instruct")
# steering_vectors is a dict of layer_name -> tensor
# These come from contrastive pair training
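# (Illustrative recipe only, not necessarily Wisent's training pipeline:
#  a common way to obtain such vectors is the mean difference of hidden
#  states between contrastive prompt sets at each layer.)
# steering_vectors = {
#     layer: harmful_acts[layer].mean(0) - harmless_acts[layer].mean(0)
#     for layer in harmful_acts
# }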
project_weights(
model,
steering_vectors,
strength=1.0, # 1.0 = full projection
norm_preserve=True, # Default, maintains weight magnitudes
)
# Export the modified model
export_modified_model(model, "path/to/abliterated-model")

from wisent.core.weight_modification import project_weights_norm_preserved
# Biprojection: remove harmful direction while preserving harmless
project_weights_norm_preserved(
model,
steering_vectors, # Directions to remove
harmless_vectors=harmless_vectors, # Directions to preserve
strength=1.0,
)

Instead of removing directions, bake the steering direction directly into the weights; useful for strengthening or adding behaviors.
from wisent.core.weight_modification import bake_steering_into_weights
# Add the steering direction permanently to weights
bake_steering_into_weights(
model,
steering_vectors,
alpha=1.0, # Steering strength
)
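# Why this equals runtime steering (illustrative, not the exact internals):
# runtime steering adds alpha * v to a layer's hidden state h on every
# forward pass; folding alpha * v into that layer's additive parameters
# (e.g. a bias term, where one exists) yields h + alpha * v with no hooks:
#   layer.bias.data += alpha * v   # hypothetical layer with a bias term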
# The model now behaves as if steering is always applied

Advanced methods handle multiple steering directions simultaneously.
Training-free multi-direction steering baked into weights.
Multi-directional projection with interference handling.
Progressive multi-direction with quality monitoring.
from wisent.core.weight_modification import (
MultiDirectionConfig,
train_and_bake_titan,
train_and_bake_prism,
)
config = MultiDirectionConfig(
directions={
"helpfulness": helpfulness_vectors,
"honesty": honesty_vectors,
"safety": safety_vectors,
},
strengths={
"helpfulness": 1.0,
"honesty": 0.8,
"safety": 1.2,
}
)
# TITAN approach
result = train_and_bake_titan(model, config)
# Or PRISM approach
result = train_and_bake_prism(model, config)

Guided modification uses linearity diagnostics to automatically pick the most effective layers and to track collateral damage.
from wisent.core.weight_modification import (
GuidedModificationConfig,
run_guided_modification,
compute_layer_diagnostics,
)
# Analyze which layers to modify
diagnostics = compute_layer_diagnostics(
model,
steering_vectors,
eval_dataset
)
config = GuidedModificationConfig(
steering_vectors=steering_vectors,
max_collateral_damage=0.05, # 5% max capability loss
surgical_mode=True, # Only modify most effective layers
)
result = run_guided_modification(model, config)
# Check the collateral damage report
print(result.collateral_damage_report)

Handle multiple concepts with different actions (add, remove, or modify).
from wisent.core.weight_modification import (
MultiConceptConfig,
ConceptSpec,
ConceptAction,
run_multi_concept_modification,
)
config = MultiConceptConfig(
concepts=[
ConceptSpec(
name="refusal",
vectors=refusal_vectors,
action=ConceptAction.REMOVE,
strength=0.8
),
ConceptSpec(
name="helpfulness",
vectors=helpfulness_vectors,
action=ConceptAction.ADD,
strength=1.0
),
],
orthogonalize=True, # Prevent interference
)
result = run_multi_concept_modification(model, config)

# Basic abliteration
python -m wisent modify-weights \
  --model meta-llama/Llama-3-8B-Instruct \
  --vectors refusal_vectors.pt \
  --method norm-preserved \
  --strength 1.0 \
  --output ./abliterated-model

# With optimization
python -m wisent optimize-weights \
  --model meta-llama/Llama-3-8B-Instruct \
  --task refusal-reduction \
  --eval-task mmlu \
  --max-collateral-damage 0.05 \
  --output ./optimized-model
from wisent.core.weight_modification import (
export_modified_model,
save_modified_weights,
compare_models,
)
# Full model export (HuggingFace format)
export_modified_model(
model,
"path/to/modified-model",
save_tokenizer=True,
)
# Compare before/after
comparison = compare_models(
original_model,
modified_model,
eval_prompts=["Hello!", "What is 2+2?", "Write a poem"]
)
print(comparison.capability_retention)
print(comparison.behavior_change)
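Once exported, the directory is a standard HuggingFace checkpoint and can be loaded without any steering hooks (a minimal usage sketch; the path follows the export example above and the prompt is illustrative):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the exported model and tokenizer like any other HuggingFace checkpoint;
# no steering vectors or runtime hooks are needed at inference time.
model = AutoModelForCausalLM.from_pretrained("path/to/modified-model")
tokenizer = AutoTokenizer.from_pretrained("path/to/modified-model")

inputs = tokenizer("Write a short poem about the sea.", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))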