👁️ Visão e Percepção no Unitree G1
Objetivo do Módulo
Dominar o sistema de visão do G1: câmeras Intel RealSense D435i, processamento de depth, SLAM (mapeamento e localização), e object detection pipeline completo.
📷 Intel RealSense D435i
Especificações do Hardware
G1 possui 2x RealSense D435i:
- Localização: Frontal (head), orientadas para frente
- Field of View: 87° × 58° (H × V)
- Depth range: 0.3m - 10m
- Resolução RGB: 1920x1080 @ 30 fps (default: 640x480)
- Resolução Depth: 1280x720 @ 30 fps
- IMU integrado: 6-axis (200 Hz)
Acessando Câmeras
- 🔧 Via RealSense ROS2
- 🤖 Via Unitree SDK
Instalar:
sudo apt install ros-humble-realsense2-camera
Launch:
# Câmera esquerda
ros2 launch realsense2_camera rs_launch.py \
camera_name:=camera_left \
serial_no:='_123456789' # Serial da câmera esquerda
# Câmera direita (outro terminal)
ros2 launch realsense2_camera rs_launch.py \
camera_name:=camera_right \
serial_no:='_987654321' # Serial da câmera direita
Topics publicados:
ros2 topic list | grep camera_left
# /camera_left/color/image_raw
# /camera_left/depth/image_rect_raw
# /camera_left/infra1/image_rect_raw
# /camera_left/infra2/image_rect_raw
# /camera_left/aligned_depth_to_color/image_raw
# /camera_left/accel/sample
# /camera_left/gyro/sample
#!/usr/bin/env python3
"""
camera_stream.py - Streaming de câmeras via Unitree SDK
"""
from unitree_sdk2_python import G1Robot, Subscriber
import numpy as np
import cv2
class CameraStreamer:
    """Streams the left camera's RGB and depth feeds via Unitree SDK subscribers."""

    def __init__(self, robot: G1Robot):
        """Register DDS subscribers for the left camera's color and depth topics."""
        self.robot = robot
        self.left_rgb_sub = Subscriber(
            "/camera_left/color/image_raw",
            callback=self.left_rgb_callback,
        )
        self.left_depth_sub = Subscriber(
            "/camera_left/depth/image_rect_raw",
            callback=self.left_depth_callback,
        )

    def left_rgb_callback(self, msg):
        """Decode one RGB frame from the DDS message and display it."""
        frame = np.frombuffer(msg.data, dtype=np.uint8)
        frame = frame.reshape((msg.height, msg.width, 3))
        # OpenCV windows expect BGR channel order.
        cv2.imshow("Left Camera - RGB", cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        cv2.waitKey(1)

    def left_depth_callback(self, msg):
        """Decode one 16-bit depth frame (millimeters), colorize and display it."""
        raw = np.frombuffer(msg.data, dtype=np.uint16)
        raw = raw.reshape((msg.height, msg.width))
        meters = raw.astype(np.float32) / 1000.0  # mm -> m
        # Scale to 0-255 for display, then apply a JET colormap.
        vis = cv2.normalize(meters, None, 0, 255,
                            cv2.NORM_MINMAX, dtype=cv2.CV_8U)
        cv2.imshow("Left Camera - Depth", cv2.applyColorMap(vis, cv2.COLORMAP_JET))
        cv2.waitKey(1)

    def run(self):
        """Block and process incoming camera messages."""
        # NOTE(review): only the RGB subscriber is spun here — confirm the
        # SDK's spin() also services the depth subscriber's callbacks.
        self.left_rgb_sub.spin()
def main():
    """Connect to the robot, then start streaming the cameras."""
    robot = G1Robot()
    robot.wait_for_connection()
    CameraStreamer(robot).run()


if __name__ == "__main__":
    main()
🗺️ Depth Processing
Point Cloud Generation
#!/usr/bin/env python3
"""
pointcloud_generator.py - Gera point clouds das câmeras
"""
import numpy as np
import open3d as o3d
from sensor_msgs.msg import Image, CameraInfo
import cv2
class PointCloudGenerator:
    """Back-projects RealSense depth images into Open3D point clouds."""

    def __init__(self):
        # Nominal D435i intrinsics at 1280x720.
        # NOTE(review): these are placeholder values — use the calibrated
        # intrinsics from /camera_*/color/camera_info in production.
        self.fx = 640.0  # focal length X [px]
        self.fy = 640.0  # focal length Y [px]
        self.cx = 640.0  # principal point X [px]
        self.cy = 360.0  # principal point Y [px]
        self.width = 1280
        self.height = 720

    def depth_to_pointcloud(self, depth_image: np.ndarray,
                            rgb_image: np.ndarray = None) -> o3d.geometry.PointCloud:
        """
        Convert a depth image into an (optionally colored) point cloud.

        Args:
            depth_image: Depth map in meters, shape (H, W).
            rgb_image: Optional RGB image, shape (H, W, 3), values 0-255.

        Returns:
            Open3D PointCloud with invalid points (depth <= 0) removed.
        """
        # Build the pixel grid from the actual image shape (not the configured
        # defaults) so the method also works for other resolutions.
        height, width = depth_image.shape[:2]
        u, v = np.meshgrid(np.arange(width), np.arange(height))
        # Pinhole back-projection to camera coordinates.
        z = depth_image
        x = (u - self.cx) * z / self.fx
        y = (v - self.cy) * z / self.fy
        # Flatten and drop pixels with no depth return (z == 0).
        points = np.stack([x, y, z], axis=-1).reshape(-1, 3)
        valid = points[:, 2] > 0
        points = points[valid]
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(points)
        # Attach per-point colors when an RGB image is supplied.
        if rgb_image is not None:
            colors = rgb_image.reshape(-1, 3)[valid] / 255.0
            pcd.colors = o3d.utility.Vector3dVector(colors)
        return pcd

    def visualize_pointcloud(self, pcd: o3d.geometry.PointCloud):
        """Open an interactive viewer window for the point cloud."""
        o3d.visualization.draw_geometries([pcd])
# Exemplo de uso:
def main():
    """Demo: generate and display a point cloud from synthetic depth data."""
    generator = PointCloudGenerator()
    # Synthetic depth in [0, 5) m — in practice this comes from a ROS topic
    # or the Unitree SDK.
    depth = np.random.rand(720, 1280) * 5.0
    pcd = generator.depth_to_pointcloud(depth)
    print(f"Point cloud: {len(pcd.points)} pontos")
    generator.visualize_pointcloud(pcd)


if __name__ == "__main__":
    main()
Obstacle Detection
class ObstacleDetector:
    """Detects nearby obstacles from a metric depth image."""

    def __init__(self, height_threshold=0.15):
        """
        Args:
            height_threshold: Minimum height to consider an obstacle [m].
                NOTE(review): stored but not used by detect_obstacles —
                confirm intended use or remove.
        """
        self.height_threshold = height_threshold

    def detect_obstacles(self, depth_image: np.ndarray,
                         near: float = 0.3, far: float = 2.0,
                         min_area: float = 100.0) -> list:
        """
        Detect obstacles in a depth image.

        Args:
            depth_image: Depth map in meters, shape (H, W).
            near: Minimum valid depth [m] — readings below the sensor's
                minimum range are treated as invalid.
            far: Maximum depth considered an obstacle [m].
            min_area: Minimum contour area [px^2] used to filter noise.

        Returns:
            List of (x, y, width, height) bounding boxes in pixel coordinates.
        """
        # Binary mask of pixels inside the (near, far) depth band.
        close_points = (depth_image > near) & (depth_image < far)
        close_points_img = (close_points * 255).astype(np.uint8)
        # Connected regions of the mask become obstacle candidates.
        contours, _ = cv2.findContours(
            close_points_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        obstacles = []
        for contour in contours:
            if cv2.contourArea(contour) < min_area:
                continue  # too small: likely sensor noise
            obstacles.append(cv2.boundingRect(contour))
        return obstacles

    def visualize_obstacles(self, rgb_image: np.ndarray,
                            obstacles: list) -> np.ndarray:
        """Draw red bounding boxes and labels for each obstacle on a copy."""
        img_vis = rgb_image.copy()
        for (x, y, w, h) in obstacles:
            cv2.rectangle(img_vis, (x, y), (x + w, y + h), (0, 0, 255), 2)
            cv2.putText(img_vis, "Obstacle", (x, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
        return img_vis
🗺️ SLAM (Simultaneous Localization and Mapping)
ORB-SLAM3 com RealSense
Instalar:
# ORB-SLAM3 dependencies
sudo apt install libeigen3-dev libopencv-dev libpangolin-dev
# Clonar ORB-SLAM3
cd ~
git clone https://github.com/UZ-SLAMLab/ORB_SLAM3.git
cd ORB_SLAM3
# Build
chmod +x build.sh
./build.sh
# Build ROS2 wrapper
cd ~/g1_ros2_ws/src
git clone https://github.com/thien94/orb_slam3_ros2.git
cd ~/g1_ros2_ws
colcon build --packages-select orb_slam3_ros2
Configuração para D435i:
# D435i.yaml - ORB-SLAM3 config
%YAML:1.0
Camera.type: "PinHole"
Camera.fx: 640.0
Camera.fy: 640.0
Camera.cx: 640.0
Camera.cy: 360.0
Camera.k1: 0.0
Camera.k2: 0.0
Camera.p1: 0.0
Camera.p2: 0.0
Camera.width: 1280
Camera.height: 720
Camera.fps: 30.0
Camera.RGB: 1
ORBextractor.nFeatures: 1200
ORBextractor.scaleFactor: 1.2
ORBextractor.nLevels: 8
Viewer.KeyFrameSize: 0.05
Viewer.KeyFrameLineWidth: 1.0
Launch:
ros2 run orb_slam3_ros2 stereo \
--vocabulary ./Vocabulary/ORBvoc.txt \
--settings ./Examples/Stereo/D435i.yaml
RTAB-Map (Alternative)
# Instalar RTAB-Map
sudo apt install ros-humble-rtabmap-ros
# Launch
ros2 launch rtabmap_launch rtabmap.launch.py \
rtabmap_args:="--delete_db_on_start" \
depth_topic:=/camera_left/aligned_depth_to_color/image_raw \
rgb_topic:=/camera_left/color/image_raw \
camera_info_topic:=/camera_left/color/camera_info \
frame_id:=base_link \
approx_sync:=true
Visualizar mapa:
ros2 run rviz2 rviz2
# Adicionar display: Map -> /rtabmap/mapData
🎯 Object Detection
YOLOv8 Integration
#!/usr/bin/env python3
"""
yolo_detector.py - Object detection com YOLOv8
"""
from ultralytics import YOLO
import cv2
import numpy as np
import rclpy
from rclpy.node import Node
from sensor_msgs.msg import Image
from vision_msgs.msg import Detection2DArray, Detection2D, ObjectHypothesisWithPose
from cv_bridge import CvBridge
class YOLODetectorNode(Node):
    """ROS2 node: runs YOLOv8 on the left camera and publishes 2D detections."""

    def __init__(self):
        super().__init__('yolo_detector')
        # Nano variant: smallest/fastest YOLOv8 model.
        self.model = YOLO('yolov8n.pt')
        self.image_sub = self.create_subscription(
            Image, '/camera_left/color/image_raw',
            self.image_callback, 10,
        )
        self.detection_pub = self.create_publisher(
            Detection2DArray, '/detections', 10,
        )
        self.bridge = CvBridge()
        self.get_logger().info('YOLOv8 Detector ready')

    def image_callback(self, msg: Image):
        """Run inference on one frame, publish detections and draw an overlay."""
        frame = self.bridge.imgmsg_to_cv2(msg, desired_encoding='bgr8')
        results = self.model(frame, verbose=False)

        out = Detection2DArray()
        out.header = msg.header
        for result in results:
            for box in result.boxes:
                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()

                # One Detection2D per box: center/size plus class hypothesis.
                det = Detection2D()
                det.bbox.center.position.x = float((x1 + x2) / 2)
                det.bbox.center.position.y = float((y1 + y2) / 2)
                det.bbox.size_x = float(x2 - x1)
                det.bbox.size_y = float(y2 - y1)

                hyp = ObjectHypothesisWithPose()
                hyp.hypothesis.class_id = str(int(box.cls[0]))
                hyp.hypothesis.score = float(box.conf[0])
                det.results.append(hyp)
                out.detections.append(det)

                # Overlay: green box with "<class> <score>" label.
                corner_a = (int(x1), int(y1))
                corner_b = (int(x2), int(y2))
                cv2.rectangle(frame, corner_a, corner_b, (0, 255, 0), 2)
                label = f"{self.model.names[int(box.cls[0])]} {box.conf[0]:.2f}"
                cv2.putText(frame, label, (int(x1), int(y1) - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        self.detection_pub.publish(out)
        cv2.imshow("YOLOv8 Detections", frame)
        cv2.waitKey(1)
def main():
    """Entry point: initialize ROS2, run the detector node, clean up on exit."""
    rclpy.init()
    node = YOLODetectorNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass  # Ctrl-C is the normal way to stop the node
    finally:
        # Guarantee cleanup even if spin() raises; the original skipped
        # destroy_node()/shutdown() on any exception.
        node.destroy_node()
        rclpy.shutdown()


if __name__ == '__main__':
    main()
Rodar:
# Instalar ultralytics
pip3 install ultralytics
# Rodar detector
ros2 run g1_perception yolo_detector
3D Object Localization
class Object3DLocalizer:
    """Back-projects a 2D detection into a 3D camera-frame position using depth."""

    def __init__(self, camera_intrinsics):
        """
        Args:
            camera_intrinsics: dict with pinhole parameters 'fx', 'fy'
                (focal lengths, px) and 'cx', 'cy' (principal point, px).
        """
        self.fx = camera_intrinsics['fx']
        self.fy = camera_intrinsics['fy']
        self.cx = camera_intrinsics['cx']
        self.cy = camera_intrinsics['cy']

    def localize_object(self, bbox: tuple, depth_image: np.ndarray) -> dict:
        """
        Compute the 3D position of the object at the bbox center.

        Args:
            bbox: (x1, y1, x2, y2) bounding box in pixel coordinates.
            depth_image: Depth map in meters, shape (H, W).

        Returns:
            dict with 'position' (x, y, z) in meters and 'distance'
            (Euclidean, from the camera), or None when there is no valid
            depth reading at the center pixel.
        """
        x1, y1, x2, y2 = bbox
        # Center pixel, clamped inside the image so a box touching the
        # border cannot index out of range (the original raised IndexError
        # when the center landed on row H or column W).
        h, w = depth_image.shape[:2]
        u = min(max(int((x1 + x2) / 2), 0), w - 1)
        v = min(max(int((y1 + y2) / 2), 0), h - 1)
        z = depth_image[v, u]
        if z <= 0:
            return None  # no depth return at this pixel
        # Pinhole back-projection of the center pixel.
        x = (u - self.cx) * z / self.fx
        y = (v - self.cy) * z / self.fy
        return {
            'position': (x, y, z),
            'distance': np.sqrt(x**2 + y**2 + z**2)
        }
📊 Perception Pipeline Completo
#!/usr/bin/env python3
"""
perception_system.py - Sistema de percepção completo
"""
from dataclasses import dataclass
from typing import List
@dataclass
class DetectedObject:
    """A single object found by the perception pipeline (YOLO + depth)."""
    class_name: str      # human-readable YOLO class label, e.g. "person"
    confidence: float    # detection score
    bbox_2d: tuple       # (x1, y1, x2, y2) in image pixel coordinates
    position_3d: tuple   # (x, y, z) in meters, camera frame
    distance: float      # Euclidean distance from the camera [m]
class PerceptionSystem(Node):
    """Full perception pipeline on the left RGB-D stream: YOLO detection,
    depth-based 3D localization and obstacle detection, processed at 10 Hz."""

    def __init__(self):
        super().__init__('perception_system')
        # YOLOv8 nano detector.
        self.yolo = YOLO('yolov8n.pt')
        # Nominal D435i intrinsics at 1280x720 — presumably should come from
        # /camera_left/color/camera_info; verify against the calibration.
        intrinsics = {
            'fx': 640.0, 'fy': 640.0,
            'cx': 640.0, 'cy': 360.0
        }
        self.localizer = Object3DLocalizer(intrinsics)
        # Depth-based obstacle detector (default thresholds).
        self.obstacle_detector = ObstacleDetector()
        # Camera subscribers: RGB and depth cached independently.
        self.rgb_sub = self.create_subscription(
            Image, '/camera_left/color/image_raw',
            self.rgb_callback, 10
        )
        self.depth_sub = self.create_subscription(
            Image, '/camera_left/depth/image_rect_raw',
            self.depth_callback, 10
        )
        # Latest frames (None until the first message of each kind arrives).
        self.latest_rgb = None
        self.latest_depth = None
        self.detected_objects: List[DetectedObject] = []
        # Periodic pipeline tick.
        self.create_timer(0.1, self.process_perception)  # 10 Hz
        self.bridge = CvBridge()

    def rgb_callback(self, msg):
        # Cache the most recent frame as a BGR image.
        self.latest_rgb = self.bridge.imgmsg_to_cv2(msg, 'bgr8')

    def depth_callback(self, msg):
        # Depth arrives as 16-bit millimeters; cache it converted to meters.
        depth_raw = self.bridge.imgmsg_to_cv2(msg, desired_encoding='16UC1')
        self.latest_depth = depth_raw.astype(np.float32) / 1000.0  # mm -> m

    def process_perception(self):
        """One pipeline tick: detect, localize, find obstacles, visualize, log."""
        # Wait until both modalities have been received at least once.
        if self.latest_rgb is None or self.latest_depth is None:
            return
        # 1. Object detection (YOLO).
        results = self.yolo(self.latest_rgb, verbose=False)
        self.detected_objects.clear()
        for result in results:
            for box in result.boxes:
                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
                class_id = int(box.cls[0])
                confidence = float(box.conf[0])
                # 2. 3D localization from the depth at the bbox center.
                loc_3d = self.localizer.localize_object(
                    (x1, y1, x2, y2), self.latest_depth
                )
                # Detections with no valid depth reading are dropped.
                if loc_3d is not None:
                    obj = DetectedObject(
                        class_name=self.yolo.names[class_id],
                        confidence=confidence,
                        bbox_2d=(x1, y1, x2, y2),
                        position_3d=loc_3d['position'],
                        distance=loc_3d['distance']
                    )
                    self.detected_objects.append(obj)
        # 3. Obstacle detection on the depth image.
        obstacles = self.obstacle_detector.detect_obstacles(self.latest_depth)
        # 4. Visualization overlay.
        self.visualize(self.latest_rgb, obstacles)
        # 5. Logging.
        self.log_perception()

    def visualize(self, img, obstacles):
        """Draw detections (green) and obstacles (red) on a copy of the frame."""
        img_vis = img.copy()
        # Detected objects: green box with "<class> <conf> (<dist>m)" label.
        for obj in self.detected_objects:
            x1, y1, x2, y2 = [int(c) for c in obj.bbox_2d]
            cv2.rectangle(img_vis, (x1, y1), (x2, y2), (0, 255, 0), 2)
            label = f"{obj.class_name} {obj.confidence:.2f} ({obj.distance:.2f}m)"
            cv2.putText(img_vis, label, (x1, y1-10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        # Obstacles: thin red rectangles.
        for (x, y, w, h) in obstacles:
            cv2.rectangle(img_vis, (x, y), (x+w, y+h), (0, 0, 255), 1)
        cv2.imshow("Perception", img_vis)
        cv2.waitKey(1)

    def log_perception(self):
        """Log the current set of detected objects with their distances."""
        if len(self.detected_objects) > 0:
            self.get_logger().info(f"Detected {len(self.detected_objects)} objects:")
            for obj in self.detected_objects:
                self.get_logger().info(
                    f" - {obj.class_name}: {obj.distance:.2f}m @ {obj.position_3d}"
                )
def main():
    """Entry point: initialize ROS2, run the perception system, clean up."""
    rclpy.init()
    node = PerceptionSystem()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass  # Ctrl-C is the normal way to stop the node
    finally:
        # Guarantee cleanup even if spin() raises; the original skipped
        # destroy_node()/shutdown() on any exception.
        node.destroy_node()
        rclpy.shutdown()
✅ Checklist de Conclusão
- Acessei câmeras RealSense D435i (ROS2 ou Unitree SDK)
- Gerei point clouds de depth images
- Implementei obstacle detection
- Configurei SLAM (ORB-SLAM3 ou RTAB-Map)
- Integrei YOLOv8 para object detection
- Implementei 3D object localization
- Criei Perception System completo
🔗 Próximos Passos
Próximo Módulo
Integre todos os sistemas: state machine, decision making, error recovery e navegação multi-floor.
⏱️ Tempo estimado: 80-100 min 🧠 Nível: Avançado 💻 Hands-on: 75% prático, 25% teórico