Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 61 additions & 57 deletions apps/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,25 @@ def _is_lower_is_better(name: str) -> bool:
}


_RE_NUMBER = re.compile(r"(\d+)")


def split_numbers(s: str) -> list[str | int]:
"""Split a string into a list of parts, with numbers converted to `int`.

This function may be used as the `key` to sort strings in a natural order similar to files in macOS's Finder.

Args:
s: A string that may contain numbers. For example, `"$42k"`.

Returns:
A list of parts, with numbers converted to `int`. For example, `["$", 42, "k"]`.
"""
parts = _RE_NUMBER.split(s)
parts[1::2] = map(int, parts[1::2])
return parts


# ============================================================================
# Data Loading
# ============================================================================
Expand Down Expand Up @@ -182,7 +201,7 @@ def get_record_directories(run_dir: Path) -> list[Path]:
records_dir = run_dir / "records"
if not records_dir.exists():
return []
return sorted([d for d in records_dir.iterdir() if d.is_dir()], key=lambda d: d.name)
return sorted([d for d in records_dir.iterdir() if d.is_dir()], key=lambda d: split_numbers(d.name))


def load_record_result(record_dir: Path) -> ConversationResult | None:
Expand Down Expand Up @@ -478,7 +497,7 @@ def _get_record_data_dirs(record_dir: Path) -> list[tuple[str, Path]]:
for d in record_dir.iterdir()
if d.is_dir() and any(f for f in d.iterdir() if f.suffix in (".json", ".wav", ".jsonl"))
],
key=lambda d: d.name,
key=lambda d: split_numbers(d.name),
)
if trial_dirs:
return [(d.name, d) for d in trial_dirs]
Expand Down Expand Up @@ -1873,9 +1892,9 @@ def render_metrics_tab(metrics: RecordMetrics | None):
if metric_score.normalized_score is not None
else f"**{metric_name}**"
):
col1, col2 = st.columns([1, 3])
cols = iter(st.columns([1, 3]))

with col1:
with next(cols):
st.metric("Score", f"{metric_score.score:.3f}" if metric_score.score is not None else "N/A")
st.metric(
"Normalized",
Expand All @@ -1898,7 +1917,7 @@ def render_metrics_tab(metrics: RecordMetrics | None):
prefix = "⚠ " if flagged else ""
st.markdown(f"{prefix}**{label}:** {score_str}")

with col2:
with next(cols):
if metric_score.details:
st.markdown("**Details:**")
if "explanation" in metric_score.details:
Expand Down Expand Up @@ -1978,29 +1997,29 @@ def render_processed_data_tab(metrics: RecordMetrics | None):
st.info("No tool responses data")

# Transcripts by speaker
col1, col2 = st.columns(2)
with col1:
cols = iter(st.columns(2))
with next(cols):
with st.expander("Assistant Transcript (by Turn)"):
if context.get("transcribed_assistant_turns"):
st.json(context["transcribed_assistant_turns"])
else:
st.info("No assistant transcript data")
with col2:
with next(cols):
with st.expander("User Transcript (by Turn)"):
if context.get("transcribed_user_turns"):
st.json(context["transcribed_user_turns"])
else:
st.info("No user transcript data")

# TTS text
col1, col2 = st.columns(2)
with col1:
cols = iter(st.columns(2))
with next(cols):
with st.expander("Assistant TTS Text (by Turn)"):
if context.get("intended_assistant_turns"):
st.json(context["intended_assistant_turns"])
else:
st.info("No assistant TTS text data")
with col2:
with next(cols):
with st.expander("User TTS Text (by Turn)"):
if context.get("intended_user_turns"):
st.json(context["intended_user_turns"])
Expand Down Expand Up @@ -2506,15 +2525,15 @@ def render_record_detail(selected_run_dir: Path):

# Result summary
if result:
col1, col2, col3 = st.columns(3)
with col1:
cols = iter(st.columns(3))
with next(cols):
if result.completed:
st.success(f"Completed ({result.conversation_ended_reason or 'ok'})")
else:
st.warning(f"Failed: {result.error or 'unknown'}")
with col2:
with next(cols):
st.metric("Duration", f"{result.duration_seconds:.1f}s")
with col3:
with next(cols):
st.metric("Turns", result.num_turns)

# Audio player
Expand All @@ -2523,10 +2542,10 @@ def render_record_detail(selected_run_dir: Path):
if audio_path.exists() or el_audio_path.exists():
st.markdown("### Audio Recording")
if audio_path.exists():
st.audio(str(audio_path))
st.audio(audio_path)
if el_audio_path.exists():
st.caption("ElevenLabs recording")
st.audio(str(el_audio_path))
st.audio(el_audio_path)

# User goal & ground truth
with st.expander("User Goal", expanded=False):
Expand All @@ -2550,61 +2569,46 @@ def render_record_detail(selected_run_dir: Path):
preload_audio_data(selected_record_dir)

# Tabs
tab1, tab2, tab3, tab4, tab5 = st.tabs(
[
"Conversation Trace",
"Transcript",
"Metrics Detail",
"Processed Data",
"Turn Taking Analysis",
]
tab_names = (
"Conversation Trace",
"Transcript",
"Metrics Detail",
"Processed Data",
"Turn Taking Analysis",
)
tabs = iter(st.tabs(tab_names))

with tab1:
with next(tabs):
render_conversation_trace_tab(metrics, selected_record_dir)

with tab2:
with next(tabs):
st.markdown("### Transcript")
transcript_df = None
if metrics and metrics.context and "turns_transcript" in metrics.context:
try:
turns = metrics.context["turns_transcript"]
if turns:
transcript_df = pd.DataFrame(turns)
column_config = {}
if "content" in transcript_df.columns:
column_config["content"] = st.column_config.TextColumn("content", width="large")
if "timestamp" in transcript_df.columns:
column_config["timestamp"] = st.column_config.TextColumn("timestamp", width="small")
if "role" in transcript_df.columns:
column_config["role"] = st.column_config.TextColumn("role", width="small")
st.dataframe(transcript_df, hide_index=True, column_config=column_config)
else:
st.info("No transcript data available")
transcript_df = pd.DataFrame(turns) if turns else pd.DataFrame()
except Exception:
transcript_df = format_transcript(selected_record_dir / "transcript.jsonl")
if not transcript_df.empty:
st.dataframe(transcript_df, hide_index=True)
else:
transcript_df = None # fall back to transcript.jsonl
if transcript_df is None:
transcript_df = format_transcript(selected_record_dir / "transcript.jsonl")
if not transcript_df.empty:
column_config = {}
if "content" in transcript_df.columns:
column_config["content"] = st.column_config.TextColumn("content", width="large")
if "timestamp" in transcript_df.columns:
column_config["timestamp"] = st.column_config.TextColumn("timestamp", width="small")
if "role" in transcript_df.columns:
column_config["role"] = st.column_config.TextColumn("role", width="small")
st.dataframe(transcript_df, hide_index=True, column_config=column_config)
else:
st.info("No transcript data available")
if transcript_df.empty:
st.info("No transcript data available")
else:
transcript_df = transcript_df.set_index(["timestamp", "role"])
column_config = {
"timestamp": st.column_config.DatetimeColumn(format="YYYY-MM-DD HH:mm:ss.SSS"),
"role": st.column_config.MultiselectColumn(options=("user", "assistant"), color=("purple", "blue")),
}
st.dataframe(transcript_df, column_config=column_config)

with tab3:
with next(tabs):
render_metrics_tab(metrics)

with tab4:
with next(tabs):
render_processed_data_tab(metrics)

with tab5:
with next(tabs):
render_audio_analysis_tab(selected_record_dir)


Expand Down
2 changes: 1 addition & 1 deletion src/eva/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@

# Bump metrics_version when changes affect metric computation (metrics code,
# judge prompts, pricing tables, postprocessor).
metrics_version = "2.1.1"
metrics_version = "2.1.2"
23 changes: 9 additions & 14 deletions src/eva/metrics/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,27 +701,22 @@ def _label_trailing_assistant_turn(context: "_ProcessorContext", last_entry: dic
if trailing_turn_id is None:
return

text = (
context.conversation_trace[-1]["content"]
if last_entry.get("role") == "assistant"
else context.intended_assistant_turns[trailing_turn_id]
)
labeled = f"{text} {AnnotationLabel.CUT_OFF_ON_ITS_OWN}"

if last_entry.get("role") == "assistant":
context.conversation_trace[-1]["content"] = labeled
context.conversation_trace[-1]["content"] += f" {AnnotationLabel.CUT_OFF_ON_ITS_OWN}"
else:
labeled = f"{context.intended_assistant_turns[trailing_turn_id]} {AnnotationLabel.CUT_OFF_ON_ITS_OWN}"
context.conversation_trace.append(
{"role": "assistant", "content": labeled, "type": "intended", "turn_id": trailing_turn_id}
)

# Sync intended + transcribed (skip intended for S2S — no intended text exists)
if context.pipeline_type != PipelineType.S2S:
context.intended_assistant_turns[trailing_turn_id] = labeled
if not context.transcribed_assistant_turns.get(trailing_turn_id):
context.transcribed_assistant_turns[trailing_turn_id] = labeled
else:
# Append the label to the aggregated turn text (skip intended for S2S — no intended text exists).
if context.intended_assistant_turns.get(trailing_turn_id) and context.pipeline_type != PipelineType.S2S:
context.intended_assistant_turns[trailing_turn_id] += f" {AnnotationLabel.CUT_OFF_ON_ITS_OWN}"
if context.transcribed_assistant_turns.get(trailing_turn_id):
context.transcribed_assistant_turns[trailing_turn_id] += f" {AnnotationLabel.CUT_OFF_ON_ITS_OWN}"
else:
# STT produced no text for the final turn — backfill from the (already-labeled) intended text.
context.transcribed_assistant_turns[trailing_turn_id] = context.intended_assistant_turns.get(trailing_turn_id)

logger.info(f"Record {context.record_id}: Labeled trailing assistant at turn {trailing_turn_id}")

Expand Down
Loading