Wisent supports steering across multiple modalities beyond text, including audio, video, and robotics. The same contrastive pair approach works across all modalities.
Text: Steer language models for helpfulness, safety, creativity, and more (a minimal text example follows this list).
Audio: Modify speech recognition and audio generation models.
Video: Apply steering to video understanding and generation models.
Robotics: Steer policy networks for safer, gentler robot actions.
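Text steering uses the same workflow as the other modalities shown below. A minimal sketch, assuming a Wisent.for_text constructor by analogy with the for_audio and for_video constructors that appear later (the constructor name is an assumption, not confirmed here; the model id is the one used in the adapter example at the end of this page):

from wisent import Wisent, TextContent

# Hypothetical sketch: Wisent.for_text is assumed by analogy with
# for_audio / for_video below; it is not confirmed by this page.
wisent = Wisent.for_text("meta-llama/Llama-3-8B-Instruct")

wisent.add_pair(
    positive=TextContent(text="Happy to help! Let's work through it together."),
    negative=TextContent(text="Figure it out yourself."),
    trait="helpfulness",
)

wisent.train()
response = wisent.generate(
    TextContent(text="My build keeps failing. What should I check?"),
    steer={"helpfulness": 1.5},
)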
Wisent provides typed content wrappers for each modality:
from wisent import (
    # Text
    TextContent,
    # Audio
    AudioContent,
    # Visual
    VideoContent,
    ImageContent,
    # Robotics
    RobotState,
    RobotAction,
    RobotTrajectory,
    # Combined
    MultimodalContent,
)

from wisent import Wisent, AudioContent
# Create Wisent for audio models
wisent = Wisent.for_audio("openai/whisper-large-v3")
# Add contrastive pairs using audio files
wisent.add_pair(
    positive=AudioContent.from_file("calm_speech.wav"),
    negative=AudioContent.from_file("angry_speech.wav"),
    trait="calmness",
)
# Train steering vectors
wisent.train()
# Transcribe with steering applied
audio_input = AudioContent.from_file("input.wav")
transcript = wisent.transcribe(
    audio_input,
    steer={"calmness": 1.2},
)
from wisent import Wisent, VideoContent

# Create Wisent for video models
wisent = Wisent.for_video("MCG-NJU/videomae-base")
# Add contrastive pairs using video files
wisent.add_pair(
    positive=VideoContent.from_file("safe_action.mp4"),
    negative=VideoContent.from_file("unsafe_action.mp4"),
    trait="safety",
)
# Train and generate
wisent.train()
video_input = VideoContent.from_file("input_video.mp4")
# For an encoder model like VideoMAE, the output is a steered embedding
embedding = wisent.generate(
    video_input,
    steer={"safety": 2.0},
)

from wisent import Wisent, RobotState, RobotTrajectory
import torch
# Your trained policy network (any torch.nn.Module)
policy_network = ...
# Create Wisent for robotics
wisent = Wisent.for_robotics(model=policy_network)
# Define gentle vs forceful trajectories
gentle_trajectory = RobotTrajectory(...)
forceful_trajectory = RobotTrajectory(...)
# Add contrastive pairs
wisent.add_pair(
    positive=gentle_trajectory,
    negative=forceful_trajectory,
    trait="gentleness",
)
# Train steering vectors
wisent.train()
# Get steered actions from robot state
robot_state = RobotState(...)
action = wisent.act(
    robot_state,
    steer={"gentleness": 1.5},
)

from wisent import Wisent, MultimodalContent, ImageContent, TextContent
# Create Wisent for multimodal models
wisent = Wisent.for_multimodal("llava-hf/llava-v1.6-mistral-7b-hf")
# Create multimodal contrastive pairs
# Both sides share the same image; only the prompt differs,
# so the pair isolates the target trait
positive = MultimodalContent(
    image=ImageContent.from_file("safe_image.jpg"),
    text=TextContent(text="Describe this image helpfully")
)
negative = MultimodalContent(
    image=ImageContent.from_file("safe_image.jpg"),
    text=TextContent(text="Describe this image")
)
wisent.add_pair(
    positive=positive,
    negative=negative,
    trait="helpfulness",
)
wisent.train()
# Generate with multimodal input
input_content = MultimodalContent(
    image=ImageContent.from_file("query_image.jpg"),
    text=TextContent(text="What's in this image?")
)
response = wisent.generate(
    input_content,
    steer={"helpfulness": 1.5},
)

Under the hood, each modality is handled by a dedicated adapter module. The adapters can also be used directly for more advanced tasks:
from wisent import (
    BaseAdapter,        # Abstract base class
    TextAdapter,        # For LLMs
    AudioAdapter,       # For audio models
    VideoAdapter,       # For video models
    RoboticsAdapter,    # For policy networks
    MultimodalAdapter,  # For VLMs
)
# Direct adapter usage
from wisent import TextAdapter
adapter = TextAdapter(
    model_name="meta-llama/Llama-3-8B-Instruct",
    device="cuda",
)
# Extract activations manually
activations = adapter.extract_activations(
    TextContent(text="Hello world"),
    target_layers=["layer.15", "layer.16"],
)
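Extracted activations can drive custom steering logic. A sketch of deriving a steering direction by hand as a mean difference over contrastive activations, assuming extract_activations returns a mapping from layer name to a fixed-shape tensor (the return type and the steering_vector helper are illustrative assumptions, not part of the documented API):

import torch

from wisent import TextAdapter, TextContent

adapter = TextAdapter(
    model_name="meta-llama/Llama-3-8B-Instruct",
    device="cuda",
)

# Hypothetical sketch: assumes extract_activations returns
# {layer_name: activation_tensor} with one fixed-shape tensor
# per requested layer.
def steering_vector(positive_texts, negative_texts, layer="layer.15"):
    def mean_activation(texts):
        acts = [
            adapter.extract_activations(
                TextContent(text=t), target_layers=[layer]
            )[layer]
            for t in texts
        ]
        return torch.stack(acts).mean(dim=0)

    # Contrastive direction: positive mean minus negative mean
    return mean_activation(positive_texts) - mean_activation(negative_texts)

calm_direction = steering_vector(
    ["Let's take a deep breath and go slowly."],
    ["Hurry up, this is infuriating!"],
)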