Goal: For every video in a Google Drive folder, extract body keypoints (COCO-17) for each person per frame on GPU, save per‑video CSVs, and generate annotated videos.
This mirrors the MediaPipe notebook’s structure but uses a CUDA-native pipeline that works reliably on Colab GPUs. This version fixes occasional IndexError when a frame/person returns no keypoints or fewer than 17 keypoints.
1) Runtime & GPU Check¶
Instruction: Enable GPU in Colab → Runtime → Change runtime type → Hardware accelerator: GPU.
# DO NOT CHANGE ANYTHING IN THIS CELL
!nvidia-smi || echo "nvidia-smi not available (GPU runtime not enabled?)"
import torch, platform
print("PyTorch CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
print("GPU name:", torch.cuda.get_device_name(0))
print("Python:", platform.python_version())
2) Install Dependencies¶
# DO NOT CHANGE ANYTHING IN THIS CELL
%pip -q install ultralytics==8.3.30 opencv-python-headless pandas numpy tqdm pyarrow
import ultralytics
from ultralytics import YOLO
print("Ultralytics version:", ultralytics.__version__)
3) Imports¶
# DO NOT CHANGE ANYTHING IN THIS CELL
import os, sys, cv2, math, time, pathlib, json, glob
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from ultralytics import YOLO
4) Set Input Folder (Drive) and Output Folder¶
Edit one line: INPUT_DIR — the Drive folder that contains your videos.
All outputs (CSVs + annotated videos) will be written to OUTPUT_DIR.
# ===============================================================
# Google Drive ONLY — Batch all videos in a folder
# ===============================================================
from google.colab import drive
# Mount Drive so the folders below are reachable under /content/drive.
drive.mount('/content/drive')
# ---- STUDENTS: EDIT THIS ONLY ----
INPUT_DIR = "/content/drive/MyDrive/LS100_videos" # <--- folder containing your .mp4/.mov/.avi/.mkv files
OUTPUT_DIR = "/content/drive/MyDrive/LS100_outputs" # <--- outputs will be saved here
# ----------------------------------
import os, glob
# Create the output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Lower-case video extensions accepted by list_videos (matched case-insensitively).
VIDEO_EXTS = (".mp4", ".mov", ".avi", ".mkv")


def list_videos(folder):
    """Return the sorted paths of the video files directly inside *folder*.

    A file counts as a video when its name ends with one of ``VIDEO_EXTS``,
    compared case-insensitively (so ``.MP4`` and ``.Mp4`` both match).
    Hidden files (names starting with ``.``) and sub-directories are skipped.
    The folder name is used literally, so characters such as ``[`` or ``*``
    in the path are not treated as wildcards.

    Returns an empty list when *folder* does not exist or is not a directory.
    """
    if not os.path.isdir(folder):
        return []
    found = []
    with os.scandir(folder) as entries:
        for entry in entries:
            # Skip dot-files (e.g. macOS "._clip.mp4" sidecar files).
            if entry.name.startswith("."):
                continue
            if entry.is_file() and entry.name.lower().endswith(VIDEO_EXTS):
                found.append(entry.path)
    return sorted(found)
# Collect every video in INPUT_DIR and echo the list so students can verify it.
video_files = list_videos(INPUT_DIR)
print("Found the following videos:")
for v in video_files:
    print(" •", v)

# Stop right here when the folder is empty; nothing downstream can run without videos.
if not video_files:
    raise FileNotFoundError(
        f"❌ No video files found in:\n{INPUT_DIR}\n"
        "Make sure your videos are inside that folder and use .mp4/.mov/.avi/.mkv formats."
    )

print(f"\n✅ Total videos to process: {len(video_files)}")
5) Choose Pose Model Variant¶
Pick a model for speed/accuracy trade‑off:
- `yolov8n-pose.pt` → nano (fastest)
- `yolov8s-pose.pt` → small (balanced) ← default
- Larger models: `yolov8m/l/x-pose.pt` (slower, higher accuracy)
# NOTE: You may change ONLY the model filename below.
MODEL_WEIGHTS = "yolov8s-pose.pt" # try: 'yolov8n-pose.pt' for faster runs
# DO NOT CHANGE ANYTHING BELOW
# Fail early with a readable message when no CUDA device is visible to PyTorch.
assert torch.cuda.is_available(), "GPU not available. Enable GPU runtime in Colab."
# Build the pose model from the chosen weights file and move it to the GPU.
model = YOLO(MODEL_WEIGHTS).to('cuda')
print("Loaded model on CUDA:", MODEL_WEIGHTS)
6) COCO Keypoint Names (17)¶
We export COCO‑17 keypoints per person (x, y, conf).
# DO NOT CHANGE ANYTHING IN THIS CELL
# COCO-17 keypoint names. Order matters: the list position is the keypoint
# index, and each name becomes the prefix of the <name>_x/_y/_conf CSV columns.
COCO_KP_NAMES = [
    "nose",            # 0
    "left_eye",        # 1
    "right_eye",       # 2
    "left_ear",        # 3
    "right_ear",       # 4
    "left_shoulder",   # 5
    "right_shoulder",  # 6
    "left_elbow",      # 7
    "right_elbow",     # 8
    "left_wrist",      # 9
    "right_wrist",     # 10
    "left_hip",        # 11
    "right_hip",       # 12
    "left_knee",       # 13
    "right_knee",      # 14
    "left_ankle",      # 15
    "right_ankle",     # 16
]
NUM_KP = len(COCO_KP_NAMES)
7) Helper — Video Metadata¶
# DO NOT CHANGE ANYTHING IN THIS CELL
def get_video_fps_frames(path):
    """Return ``(fps, frame_count)`` for the video at *path*.

    Args:
        path: Path of the video file to inspect.

    Returns:
        A ``(fps, frames)`` tuple. ``fps`` falls back to 30.0 and ``frames``
        to 0 when the reported value is missing, zero, negative, or not a
        finite number.

    Raises:
        RuntimeError: If the video cannot be opened.
    """
    cap = cv2.VideoCapture(path)
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Cannot open video: {path}")
        raw_fps = cap.get(cv2.CAP_PROP_FPS)
        raw_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Always free the capture handle, including on the error path above.
        cap.release()

    # NaN is truthy, so a plain `value or default` would let it through and
    # break later arithmetic (round()/int() raise on NaN); check explicitly.
    fps = float(raw_fps) if raw_fps and math.isfinite(raw_fps) and raw_fps > 0 else 30.0
    frames = int(raw_frames) if raw_frames and math.isfinite(raw_frames) and raw_frames > 0 else 0
    return fps, frames
8) Core Function — Track & Export Keypoints to CSV¶
- Uses ByteTrack for stable `track_id`.
- Saves the annotated video under `OUTPUT_DIR/<run_name>/...`
- Saves the CSV under `OUTPUT_DIR/<run_name>_keypoints.csv`
- The pipeline supports robust export (handles missing/partial keypoints): it guards against frames/people that return zero or partial keypoints by filling `None` values instead of indexing into an empty array.
# DO NOT CHANGE ANYTHING IN THIS CELL
def _to_numpy(value):
    """Convert a tensor or array-like *value* to a NumPy array; ``None`` passes through."""
    if value is None:
        return None
    if hasattr(value, "detach"):  # torch tensor, possibly on the GPU
        value = value.detach().cpu().numpy()
    return np.asarray(value)


def _find_annotated_video(ann_dir, input_video):
    """Return the annotated video path inside *ann_dir* (or None), preferring the input's stem."""
    if not os.path.isdir(ann_dir):
        return None
    candidates = sorted(
        f for f in os.listdir(ann_dir)
        if f.lower().endswith((".mp4", ".avi", ".mov", ".mkv"))
    )
    if not candidates:
        return None
    stem = os.path.splitext(os.path.basename(input_video))[0]
    for f in candidates:
        if os.path.splitext(f)[0] == stem:
            return os.path.join(ann_dir, f)
    return os.path.join(ann_dir, candidates[0])


def process_video_to_csv(
    input_video: str,
    output_dir: str,
    model: YOLO,
    conf: float = 0.25,
    iou: float = 0.5,
    imgsz: int = 640,
    tracker: str = "bytetrack.yaml",
    run_name: str = "run1",
) -> dict:
    """Run YOLOv8-Pose tracking on a video and export keypoints to CSV.

    One CSV row is written per detected person per frame, holding the video
    name, frame index, timestamp, track id, detection confidence and an
    (x, y, conf) triple for each name in ``COCO_KP_NAMES``. Frames without
    keypoints produce no rows. Missing keypoints are written as empty cells.
    The CSV always has a header row, even when nothing was detected.

    Args:
        input_video: Path of the video to process.
        output_dir: Folder receiving the CSV and the annotated-video run folder.
        model: Loaded pose model.
        conf: Minimum detection confidence passed to the tracker call.
        iou: NMS IoU threshold passed to the tracker call.
        imgsz: Inference image size passed to the tracker call.
        tracker: Tracker configuration file name.
        run_name: Name used for the run folder and the CSV file prefix.

    Returns: {'csv_path', 'annotated_video_dir', 'annotated_video'}
        The last two are None when no annotated output is found on disk.
    """
    os.makedirs(output_dir, exist_ok=True)
    fps, total_frames = get_video_fps_frames(input_video)
    print(f"Video FPS: {fps:.2f}, frames: {total_frames}")
    # `fps > 0` is also False for NaN, so a bad FPS value cannot poison timestamps.
    safe_fps = fps if fps > 0 else 30.0

    results = model.track(
        source=input_video,
        conf=conf,
        iou=iou,
        imgsz=imgsz,
        device=0,
        verbose=False,
        # Stream results one frame at a time: a list would keep every frame
        # (including its image) in RAM for the whole video. The annotated
        # video is still saved as long as the generator is fully consumed.
        stream=True,
        save=True,
        project=output_dir,
        name=run_name,
        tracker=tracker,
        # persist is for frame-by-frame calls; with a whole-video source it
        # would carry tracker state over from the previously processed video.
        persist=False,
    )

    video_name = os.path.basename(input_video)
    columns = ["video_file", "frame_idx", "timestamp_ms", "track_id", "det_conf"]
    for kp_name in COCO_KP_NAMES:
        columns += [f"{kp_name}_x", f"{kp_name}_y", f"{kp_name}_conf"]

    rows = []
    save_dir = None
    progress = tqdm(results, total=total_frames or None, desc="Extracting keypoints")
    for frame_idx, r in enumerate(progress):
        # Remember where this run is actually being saved (when the result says so).
        save_dir = getattr(r, "save_dir", None) or save_dir

        kps = r.keypoints
        # Skip if no keypoints
        if kps is None or kps.xy is None or len(kps.xy) == 0:
            continue

        ts_ms = int(round(frame_idx * 1000.0 / safe_fps))

        # Convert once per frame instead of once per person (fewer GPU->CPU copies).
        xy = _to_numpy(kps.xy)                         # [num_people, K, 2]; K may be < 17
        kp_conf = _to_numpy(getattr(kps, "conf", None))  # [num_people, K] or None
        boxes = r.boxes
        ids = _to_numpy(getattr(boxes, "id", None))    # track IDs; may be missing
        if ids is not None:
            ids = ids.astype(int)
        det_conf = _to_numpy(getattr(boxes, "conf", None))

        for i, person_xy in enumerate(xy):
            person_conf = kp_conf[i] if kp_conf is not None and i < len(kp_conf) else None
            num_kp = person_xy.shape[0]
            row = {
                "video_file": video_name,
                "frame_idx": frame_idx,
                "timestamp_ms": ts_ms,
                "track_id": int(ids[i]) if ids is not None and i < len(ids) else -1,
                "det_conf": float(det_conf[i]) if det_conf is not None and i < len(det_conf) else None,
            }
            # Fill keypoints, guarding for K < 17 (or K == 0)
            for kp_idx, kp_name in enumerate(COCO_KP_NAMES):
                has_xy = kp_idx < num_kp
                has_conf = has_xy and person_conf is not None and kp_idx < len(person_conf)
                row[f"{kp_name}_x"] = float(person_xy[kp_idx, 0]) if has_xy else None
                row[f"{kp_name}_y"] = float(person_xy[kp_idx, 1]) if has_xy else None
                row[f"{kp_name}_conf"] = float(person_conf[kp_idx]) if has_conf else None
            rows.append(row)

    # Explicit columns keep the header even with zero rows, so the CSV stays readable.
    df = pd.DataFrame(rows, columns=columns)
    csv_path = os.path.join(output_dir, f"{run_name}_keypoints.csv")
    df.to_csv(csv_path, index=False)
    print(f"Saved CSV: {csv_path} ({len(df):,} rows)")

    # Prefer the directory reported by the results; on a re-run the run folder
    # may not be exactly output_dir/run_name.
    ann_dir = str(save_dir) if save_dir else os.path.join(output_dir, run_name)
    return {
        "csv_path": csv_path,
        "annotated_video_dir": ann_dir if os.path.isdir(ann_dir) else None,
        "annotated_video": _find_annotated_video(ann_dir, input_video),
    }
9) Batch Run — Process Every Video in the Folder¶
- Uses the video filename (without extension) as `run_name`.
- Produces one CSV and an annotated video per input file.
# STUDENTS: You may change CONF/IOU/IMG_SIZE for experiments.
CONF = 0.25  # min detection confidence (higher = fewer, more precise detections)
IOU = 0.5  # NMS overlap threshold (higher = fewer merges; lower = more aggressive suppression)
IMG_SIZE = 640  # network input size (bigger = slower but potentially more accurate)

all_outputs = []
used_run_names = set()
for video_path in video_files:
    stem, ext = os.path.splitext(os.path.basename(video_path))
    run_name = stem
    # Two inputs can share a stem (e.g. clip.mp4 and clip.mov). Give the later
    # one a distinct run name so its CSV does not overwrite the first one.
    if run_name in used_run_names:
        run_name = f"{stem}_{ext.lstrip('.')}"
    base_name, counter = run_name, 2
    while run_name in used_run_names:
        run_name = f"{base_name}_{counter}"
        counter += 1
    used_run_names.add(run_name)

    print("\n==============================")
    print(f"🔍 Processing video: {video_path}")
    print(f"Run name: {run_name}")
    print("==============================\n")
    out = process_video_to_csv(
        input_video=video_path,
        output_dir=OUTPUT_DIR,
        model=model,
        conf=CONF,
        iou=IOU,
        imgsz=IMG_SIZE,
        tracker="bytetrack.yaml",
        run_name=run_name,
    )
    all_outputs.append(out)

print("\n🎉 ALL VIDEOS COMPLETED!")
print("Summary of outputs:")
for o in all_outputs:
    print(" • CSV:", o["csv_path"], "| Annotated video:", o["annotated_video"])
10) Preview the First CSV¶
# DO NOT CHANGE ANYTHING IN THIS CELL
# Preview the first per-video CSV that was written.
first_csv = next((o["csv_path"] for o in all_outputs if o["csv_path"]), None)
if first_csv and os.path.exists(first_csv):
    try:
        df = pd.read_csv(first_csv)
    except pd.errors.EmptyDataError:
        # A completely empty file (no header) cannot be parsed.
        print("First CSV is empty (no detections):", first_csv)
    else:
        # A bare `df.head()` inside an if-block is never displayed by the
        # notebook, so print it explicitly.
        print(df.head())
else:
    print("No CSVs found to preview.")
11) Optional — Merge All CSVs¶
# Optional: create a single merged CSV for all videos
merged_rows = []
for o in all_outputs:
    csvp = o.get("csv_path")
    if not (csvp and os.path.exists(csvp)):
        continue
    try:
        df = pd.read_csv(csvp)
    except pd.errors.EmptyDataError:
        # A completely empty file (no header) cannot be parsed; skip it
        # instead of aborting the whole merge.
        print("Skipping empty CSV:", csvp)
        continue
    # Make sure every row can be traced back to its source video.
    if "video_file" not in df.columns:
        df["video_file"] = os.path.basename(csvp).replace("_keypoints.csv", "")
    merged_rows.append(df)

if merged_rows:
    merged = pd.concat(merged_rows, axis=0, ignore_index=True)
    merged_csv = os.path.join(OUTPUT_DIR, "ALL_videos_keypoints_merged.csv")
    merged.to_csv(merged_csv, index=False)
    print("Saved merged CSV:", merged_csv, f"({len(merged):,} rows)")
    # A bare `merged.head()` inside an if-block is never displayed by the
    # notebook, so print it explicitly.
    print(merged.head())
else:
    print("No per-video CSVs were found; nothing to merge.")
12) Tips¶
- Speed vs. Accuracy: Prefer `yolov8n-pose.pt` for faster class demos; use `yolov8s-pose.pt` for better accuracy.
- Tracking IDs: `track_id` lets you follow the same person across frames. `-1` means no stable ID was assigned.
- Coordinates: The exported keypoints (`keypoints.xy`) are in pixel coordinates of the original video frame; Ultralytics already maps them back from the letterboxed model input, so no manual un-letterboxing is needed. Low-confidence keypoints may be reported at (0, 0), so check the `_conf` columns before analysis.
13) Parameter Tuning — What do CONF, IOU, IMG_SIZE do?¶
- `CONF` (confidence threshold): Minimum detection confidence required to keep a detected person (and their keypoints).
  - Higher (e.g., `0.5`) → fewer detections, more precise.
  - Lower (e.g., `0.2`) → more detections, may include false positives.
- `IOU` (Intersection-over-Union for Non-Max Suppression): Overlap threshold for merging/suppressing detections.
  - Higher (e.g., `0.7`) keeps more overlapping detections (risk of duplicates).
  - Lower (e.g., `0.4`) suppresses overlaps more aggressively.
- `IMG_SIZE` (model input size): The size images are resized to before inference.
  - Larger (e.g., `800`) → slower, can help small/complex poses.
  - Smaller (e.g., `512`) → faster, may miss small details.