Thursday, May 7, 2026
#!/bin/bash
# ================================================================
# Script : exadata_rac_copilot_triage.sh
# Purpose: Collect Oracle RAC + Exadata performance data and
# generate a structured prompt for Copilot-assisted triage.
# Scope : Oracle 19c+ RAC on Exadata Database Machine (X5–X10M)
# Output : exadata_copilot_evidence_.log (drop into Copilot)
# copilot_prompt_.txt (paste first)
# ================================================================
# Usage : ./exadata_rac_copilot_triage.sh
# Example: ./exadata_rac_copilot_triage.sh INC0012345
# Prereqs: Run as oracle user with env sourced, passwordless SSH
# (dcli) to cell nodes, and root-equivalent for ibstat.
# ================================================================
set -o pipefail
# ---- Config ----
TICKET="${1:-UNKNOWN}"
REPORT_DIR="/tmp/exadata_copilot_${TICKET}_$(date +%Y%m%d_%H%M%S)"
EVIDENCE_FILE="${REPORT_DIR}/exadata_copilot_evidence_${TICKET}.log"
PROMPT_FILE="${REPORT_DIR}/copilot_prompt_${TICKET}.txt"
SQL_OUTPUT="${REPORT_DIR}/exadata_db_metrics.txt"
CELL_OUTPUT="${REPORT_DIR}/cell_metrics.txt"
# Exadata-specific paths
CELL_GROUP_FILE="${HOME}/cell_group" # dcli cell group file
EXAWATCHER_HOME="/opt/oracle.ExaWatcher"
EXACHK_HOME="/opt/oracle.SupportTools/exachk"
mkdir -p "${REPORT_DIR}"
exec > >(tee -a "${EVIDENCE_FILE}") 2>&1
echo "============================================================"
echo "Exadata RAC Copilot Triage — Started at $(date)"
echo "ServiceNow Ticket : ${TICKET}"
echo "Hostname : $(hostname)"
echo "Report Directory : ${REPORT_DIR}"
echo "============================================================"
# ================================================================
# SECTION 1 : Exadata Platform Identification
# ================================================================
echo ""
echo "========== EXADATA PLATFORM IDENTIFICATION =========="
echo "TIMESTAMP: $(date -u '+%Y-%m-%dT%H:%M:%SZ')"
echo "--- imageinfo (Exadata version) ---"
imageinfo 2>/dev/null || echo "imageinfo not available (non-Exadata?)"
echo ""
echo "--- imageversion ---"
cat /etc/imageversion 2>/dev/null || echo "imageversion not found"
echo ""
echo "--- Exadata Model Detection ---"
dmidecode -s system-product-name 2>/dev/null | grep -i exadata || \
grep -i exadata /sys/class/dmi/id/product_name 2>/dev/null || \
echo "Product name: $(cat /sys/class/dmi/id/product_name 2>/dev/null)"
echo "--- dbnode info ---"
dbnodeupdate -i 2>/dev/null || echo "dbnodeupdate not available"
echo ""
echo "--- OS Release ---"
cat /etc/oracle-release 2>/dev/null || cat /etc/os-release 2>/dev/null | head -5
echo ""
echo "--- CPU Info ---"
lscpu 2>/dev/null | grep -E "Model name|CPU\(s\)|Thread|Core|Socket|NUMA"
echo ""
echo "--- Memory Info ---"
free -h 2>/dev/null
grep -i huge /proc/meminfo 2>/dev/null | head -6
# ================================================================
# SECTION 2 : InfiniBand Fabric Health
# ================================================================
echo ""
echo "========== INFINIBAND FABRIC HEALTH =========="
echo "--- ibstat (HCA state) ---"
ibstat 2>/dev/null | grep -E "State|Rate|Link|CA type" || echo "ibstat not available"
echo ""
echo "--- ibstatus ---"
ibstatus 2>/dev/null | head -20 || echo "ibstatus not available"
echo ""
echo "--- InfiniBand Link Errors ---"
ibqueryerrors.pl -r 2>/dev/null | head -30 || echo "ibqueryerrors not available"
echo ""
echo "--- Network Counters (ifconfig/ip) ---"
ip -s link show 2>/dev/null | grep -A5 -E "ib[0-9]|bondib" | grep -E "ib[0-9]|bondib|dropped|overrun|errors" || \
ifconfig -a 2>/dev/null | grep -A6 -E "^ib[0-9]" | grep -E "ib|drop|error"
echo ""
echo "--- UDP Buffer Settings ---"
sysctl net.core.rmem_max net.core.wmem_max net.core.rmem_default net.core.wmem_default 2>/dev/null
# ================================================================
# SECTION 3 : Exadata Cell Health (via dcli + cellcli)
# ================================================================
echo ""
echo "========== CELL HEALTH (CellCLI via dcli) =========="
if [ -f "${CELL_GROUP_FILE}" ]; then
echo "Cell group file found at ${CELL_GROUP_FILE}"
echo ""
echo "--- LIST CELL DETAIL ---"
dcli -g "${CELL_GROUP_FILE}" -l root "cellcli -e 'list cell detail'" 2>/dev/null | head -40
echo ""
echo "--- LIST CELL DISK DETAIL ---"
dcli -g "${CELL_GROUP_FILE}" -l root "cellcli -e 'list celldisk detail'" 2>/dev/null | head -60
echo ""
echo "--- LIST GRIDDISK ---"
dcli -g "${CELL_GROUP_FILE}" -l root "cellcli -e 'list griddisk'" 2>/dev/null | head -40
echo ""
echo "--- LIST IORMPLAN DETAIL ---"
dcli -g "${CELL_GROUP_FILE}" -l root "cellcli -e 'list iormplan detail'" 2>/dev/null | head -30
echo ""
echo "--- FLASHCACHE STATUS ---"
dcli -g "${CELL_GROUP_FILE}" -l root "cellcli -e 'list flashcache detail'" 2>/dev/null | head -20
echo ""
echo "--- METRICCURRENT: CELL (CPU, IOPS, MBPS, latency) ---"
dcli -g "${CELL_GROUP_FILE}" -l root \
"cellcli -e 'list metriccurrent where objectType=\"CELL\"'" 2>/dev/null | head -40
echo ""
echo "--- METRICCURRENT: CELLDISK ---"
dcli -g "${CELL_GROUP_FILE}" -l root \
"cellcli -e 'list metriccurrent where objectType=\"CELLDISK\"'" 2>/dev/null | head -40
echo ""
echo "--- METRICCURRENT: FLASHCACHE ---"
dcli -g "${CELL_GROUP_FILE}" -l root \
"cellcli -e 'list metriccurrent where objectType=\"FLASHCACHE\"'" 2>/dev/null | head -30
echo ""
echo "--- ACTIVE ALERTS (last 20) ---"
dcli -g "${CELL_GROUP_FILE}" -l root \
"cellcli -e 'list alerthistory where severity != \"clear\"'" 2>/dev/null | head -20
echo ""
echo "--- CELLSRV STATS (last 10 lines from ExaWatcher) ---"
dcli -g "${CELL_GROUP_FILE}" -l root \
"tail -10 /opt/oracle.ExaWatcher/archive/$(ls -t /opt/oracle.ExaWatcher/archive/ 2>/dev/null | grep -i cellsrv | head -1)" 2>/dev/null
else
echo "WARNING: ${CELL_GROUP_FILE} not found. Cell-level metrics unavailable."
echo "Create this file with one cell hostname per line for full coverage."
fi
# ================================================================
# SECTION 4 : Grid Infrastructure / Clusterware
# ================================================================
echo ""
echo "========== GRID INFRASTRUCTURE / CLUSTERWARE =========="
echo "--- olsnodes ---"
olsnodes -s -t 2>/dev/null || echo "olsnodes not available"
echo ""
echo "--- crsctl check cluster ---"
crsctl check cluster -all 2>/dev/null || echo "crsctl not available"
echo ""
echo "--- crsctl stat res (Exadata resources) ---"
crsctl stat res -t 2>/dev/null | head -30
echo ""
echo "--- srvctl config database ---"
DBNAME=$(ps -ef | grep pmon | grep -v grep | grep -v asm | awk -F_ '{print $3}' | head -1)
if [ -n "${DBNAME}" ]; then
srvctl config database -d "${DBNAME}" 2>/dev/null
echo ""
srvctl status database -d "${DBNAME}" 2>/dev/null
else
echo "Could not auto-detect DB name"
fi
echo ""
echo "--- ASM Disk Group Usage ---"
asmcmd lsdg 2>/dev/null || echo "asmcmd not available — check grid home"
# ================================================================
# SECTION 5 : Exadata Database Diagnostics (SQL)
# ================================================================
echo ""
echo "========== DATABASE DIAGNOSTICS (SQL via SQL*Plus) =========="
SQLPLUS="$(which sqlplus 2>/dev/null || echo 'sqlplus')"
cat > "${REPORT_DIR}/exadata_rac_metrics.sql" << 'EOSQL'
-- ================================================================
-- Exadata RAC Copilot Triage SQL — All Exadata-specific metrics
-- ================================================================
SET LINESIZE 400
SET PAGESIZE 50000
SET VERIFY OFF FEEDBACK OFF HEADING ON TIMING OFF
-- ================================================================
-- COPILOT_AUTO_INSTRUCT: DATABASE & EXADATA IDENTIFICATION
-- ================================================================
PROMPT === DB Identification ===
SELECT 'INSTANCE: '||instance_name||' #'||instance_number||' on '||host_name||
' version '||version_full||' started '||
TO_CHAR(startup_time,'YYYY-MM-DD HH24:MI') AS info FROM v$instance;
PROMPT
PROMPT === PDB Status ===
SELECT con_id, name, open_mode, restricted, total_size/1024/1024/1024 size_gb
FROM v$pdbs ORDER BY con_id;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: CELL HEALTH FROM DATABASE SIDE ===
PROMPT ============================================================
PROMPT Every cell must be ONLINE. Any OFFLINE/ERROR cell is RED.
PROMPT AVG_TIME > 5ms indicates cell latency issue.
PROMPT Compare IOPS and MBPS across cells — imbalance may indicate
PROMPT hot spots or failing components.
PROMPT
PROMPT === V$CELL (Cell summary) ===
SELECT cell_name, cell_state, cell_type, cell_version,
iops, mbps, total_io_requests, avg_time/1000 avg_time_ms
FROM v$cell ORDER BY cell_name;
PROMPT
PROMPT === V$CELL_CONFIG ===
SELECT cell_name, flashcache_mode, cpu_count, mem_total_mb, mem_free_mb,
disk_count, inter_connect_type
FROM v$cell_config;
PROMPT
PROMPT === V$CELL_STATE (per-cell thread pool) ===
SELECT cell_name, state, num_threads, num_active_threads,
num_idle_threads, num_rq_threads
FROM v$cell_state ORDER BY cell_name;
PROMPT
PROMPT === V$CELL_GLOBAL (global cell metrics) ===
SELECT cell_name, flashcache_read_iops, flashcache_write_iops,
flashcache_hit_ratio, disk_read_iops, disk_write_iops,
smart_io_read_iops, smart_io_write_iops,
cpu_util_percent, mem_util_percent
FROM v$cell_global ORDER BY cell_name;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: OFFLOAD / SMART SCAN EFFICIENCY ===
PROMPT ============================================================
PROMPT Offload efficiency = (eligible bytes - returned bytes) / eligible bytes × 100.
PROMPT Efficiency > 80%: GREEN; 40-80%: YELLOW; < 40%: RED.
PROMPT Low efficiency means Smart Scan is not functioning as expected.
PROMPT Reasons: uncommitted data, small segments, resource shortages,
PROMPT serial queries, or cell offload disabled.
PROMPT
PROMPT === Smart Scan / Offload Efficiency (v$sysstat) ===
SELECT name, value
FROM v$sysstat
WHERE name IN (
'physical read total bytes',
'physical read total IO requests',
'cell physical IO interconnect bytes',
'cell physical IO interconnect bytes returned by smart scan',
'cell physical IO bytes eligible for predicate offload',
'cell physical IO bytes saved by storage index',
'cell physical IO bytes saved during optimized file creation',
'cell flash cache read hits',
'cell writes to flash cache',
'cell overwrites in flash cache',
'cell partial writes in flash cache',
'physical read requests optimized',
'physical read total bytes optimized',
'cell IO uncompressed bytes'
)
ORDER BY name;
PROMPT
PROMPT --- Per-Instance Smart Scan Efficiency ---
SELECT inst_id,
SUM(CASE WHEN stat_name='cell physical IO bytes eligible for predicate offload'
THEN value END) AS eligible_bytes,
SUM(CASE WHEN stat_name='cell physical IO interconnect bytes'
THEN value END) AS total_io_bytes,
SUM(CASE WHEN stat_name='cell physical IO interconnect bytes returned by smart scan'
THEN value END) AS returned_bytes,
ROUND((1 - SUM(CASE WHEN stat_name='cell physical IO interconnect bytes returned by smart scan'
THEN value END) /
NULLIF(SUM(CASE WHEN stat_name='cell physical IO interconnect bytes'
THEN value END),0)) * 100, 1) AS offload_efficiency_pct
FROM gv$sysstat
WHERE stat_name IN ('cell physical IO bytes eligible for predicate offload',
'cell physical IO interconnect bytes',
'cell physical IO interconnect bytes returned by smart scan')
GROUP BY inst_id ORDER BY inst_id;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: STORAGE INDEX EFFECTIVENESS ===
PROMPT ============================================================
PROMPT "cell physical IO bytes saved by storage index" > 0 confirms
PROMPT storage indexes are eliminating disk I/O.
PROMPT "physical read requests optimized" includes both flash cache
PROMPT AND storage index savings.
PROMPT Low savings vs eligible bytes = indexes ineffective or not built.
PROMPT
PROMPT === Storage Index Savings ===
SELECT inst_id,
SUM(CASE WHEN stat_name='cell physical IO bytes saved by storage index'
THEN value END) AS bytes_saved_by_storage_index,
SUM(CASE WHEN stat_name='physical read total bytes optimized'
THEN value END) AS bytes_optimized_total
FROM gv$sysstat
WHERE stat_name IN ('cell physical IO bytes saved by storage index',
'physical read total bytes optimized')
GROUP BY inst_id ORDER BY inst_id;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: FLASH CACHE METRICS ===
PROMPT ============================================================
PROMPT "cell flash cache read hits" = reads satisfied from flash.
PROMPT Compare flash cache read hits to physical read total IO requests
PROMPT to compute effective flash cache hit ratio.
PROMPT Flash cache hit ratio > 90%: GREEN; < 70%: investigate.
PROMPT
PROMPT === Flash Cache Hit Ratio (per instance) ===
SELECT inst_id,
SUM(CASE WHEN stat_name='cell flash cache read hits' THEN value END) flash_hits,
SUM(CASE WHEN stat_name='physical read total IO requests' THEN value END) total_reads,
ROUND(SUM(CASE WHEN stat_name='cell flash cache read hits' THEN value END) /
NULLIF(SUM(CASE WHEN stat_name='physical read total IO requests' THEN value END),0)*100,1) flash_hit_pct
FROM gv$sysstat
WHERE stat_name IN ('cell flash cache read hits','physical read total IO requests')
GROUP BY inst_id ORDER BY inst_id;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: EXADATA WAIT EVENTS ===
PROMPT ============================================================
PROMPT cell smart table scan / cell smart index scan = Smart Scan active (GREEN).
PROMPT cell multiblock physical read = reads into buffer cache (no Smart Scan).
PROMPT cell single block physical read = OLTP single-block reads.
PROMPT cell list of blocks physical read = multi-block random read.
PROMPT If "cell multiblock" dominates over "cell smart table scan"
PROMPT
PROMPT during scan-heavy workloads → Smart Scan not working (YELLOW/RED).
PROMPT === All cell wait events (per instance) ===
SELECT inst_id, event, wait_class, total_waits,
ROUND(time_waited_micro/1e6,1) time_waited_sec,
ROUND(time_waited_micro/NULLIF(total_waits,0)/1000,3) avg_wait_ms
FROM gv$system_event
WHERE event LIKE 'cell%'
ORDER BY inst_id, time_waited_micro DESC;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: CELL THREAD HISTORY (ASH for Cells) ===
PROMPT ============================================================
PROMPT v$cell_thread_history shows recent cell offload activity.
PROMPT "busy" state > 10% of samples: YELLOW (cell under load).
PROMPT Look for "msg lock" or "msg wait"—indicates cell coordination delays.
PROMPT === V$CELL_THREAD_HISTORY (last 30 rows) ===
SELECT * FROM (
SELECT cell_name, cell_state, thread_state, thread_activity,
wait_class, event, sql_id, db_name,
ROUND(cpu_time_ms,1) cpu_ms, ROUND(elapsed_time_ms,1) elapsed_ms
FROM v$cell_thread_history
ORDER BY sample_time DESC
) WHERE ROWNUM <= 30;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: RAC CLUSTER HEALTH ===
PROMPT ============================================================
PROMPT === Cluster Interconnects ===
SELECT * FROM gv$cluster_interconnects;
PROMPT === Interconnect Ping ===
SELECT inst1.instance_number src, inst2.instance_number dst,
elap.ping_latency_ms
FROM v$instance_ping elap,
(SELECT instance_number FROM v$instance) inst1,
(SELECT instance_number FROM v$instance) inst2
WHERE inst1.instance_number = elap.instance_number(+);
PROMPT === GES Blocking Enqueues ===
SELECT inst_id, resource_name1, resource_name2, state, owner_node, blocked, blocker
FROM gv$ges_blocking_enqueue WHERE blocker = 1;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: PROCESS & SESSION CAPACITY ===
PROMPT ============================================================
PROMPT Per-instance AND cluster total. GREEN: <65%, YELLOW: 65-85%, RED: >85%.
SELECT NVL(TO_CHAR(p.inst_id),'CLUSTER_TOTAL') AS inst_id,
COUNT(*) AS active_processes,
MAX(p.max_proc) AS max_processes,
ROUND(COUNT(*)/MAX(p.max_proc)*100,2) AS pct_used
FROM (SELECT inst_id FROM gv$process) p
CROSS JOIN (SELECT inst_id, TO_NUMBER(VALUE) max_proc
FROM gv$parameter WHERE name='processes') m
WHERE p.inst_id = m.inst_id
GROUP BY ROLLUP(p.inst_id) ORDER BY p.inst_id;
SELECT inst_id, status, type, COUNT(*) AS session_count
FROM gv$session GROUP BY inst_id, status, type
ORDER BY inst_id, status, type;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: CPU SATURATION (per instance) ===
PROMPT ============================================================
PROMPT cpu_wait_sec = db_time_sec - db_cpu_sec. If large→CDB CPU starved.
PROMPT Note: on Exadata, Smart Scan offloads CPU from DB to cells,
PROMPT so DB CPU may appear lower than expected for I/O workloads.
SELECT inst_id,
ROUND(SUM(CASE WHEN stat_name='DB time' THEN value END)/1e6,2) db_time_sec,
ROUND(SUM(CASE WHEN stat_name='DB CPU' THEN value END)/1e6,2) db_cpu_sec,
ROUND((SUM(CASE WHEN stat_name='DB time' THEN value END)-
SUM(CASE WHEN stat_name='DB CPU' THEN value END))/1e6,2) cpu_wait_sec
FROM gv$sys_time_model GROUP BY inst_id ORDER BY inst_id;
SELECT inst_id, value AS cpu_count
FROM gv$parameter WHERE name='cpu_count' ORDER BY inst_id;
SELECT inst_id, name, is_top_plan, cpu_managed, utilization_limit
FROM gv$rsrc_plan WHERE is_top_plan='TRUE' ORDER BY inst_id;
PROMPT === OS CPU Busy/Idle (gv$osstat) ===
SELECT inst_id,
ROUND(SUM(CASE WHEN stat_name='BUSY_TIME' THEN value END)/100,1) cpu_busy_sec,
ROUND(SUM(CASE WHEN stat_name='IDLE_TIME' THEN value END)/100,1) cpu_idle_sec,
ROUND(SUM(CASE WHEN stat_name='RSRC_MGR_CPU_WAIT_TIME' THEN value END)/100,1) rm_wait_sec,
SUM(CASE WHEN stat_name='NUM_CPUS' THEN value END) num_cpus
FROM gv$osstat
WHERE stat_name IN ('BUSY_TIME','IDLE_TIME','RSRC_MGR_CPU_WAIT_TIME','NUM_CPUS')
GROUP BY inst_id ORDER BY inst_id;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: RAC GC WAIT EVENTS ===
PROMPT ============================================================
PROMPT gc block lost > 0: RED (InfiniBand packet loss).
PROMPT gc cr/current block busy: hot-block contention.
PROMPT 3-way > 5% of 2-way: YELLOW. Check interconnect latency.
PROMPT === All GC wait events (per instance) ===
SELECT inst_id, event, total_waits,
ROUND(time_waited_micro/1e6,1) time_waited_sec,
ROUND(time_waited_micro/NULLIF(total_waits,0)/1000,3) avg_wait_ms
FROM gv$system_event
WHERE event LIKE 'gc%'
ORDER BY inst_id, time_waited_micro DESC;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: TOP ASH WAITS & SQL (last hour) ===
PROMPT ============================================================
PROMPT --- Wait Classes (last hour, all instances) ---
SELECT wait_class, COUNT(*) AS samples,
ROUND(COUNT(*)/36,0) AS est_active_sessions
FROM gv$active_session_history
WHERE sample_time > SYSDATE - 1/24
AND session_state = 'WAITING' AND wait_class <> 'Idle'
GROUP BY wait_class ORDER BY samples DESC;
PROMPT --- Top 10 Wait Events (last hour) ---
SELECT event, wait_class, COUNT(*) AS samples
FROM gv$active_session_history
WHERE sample_time > SYSDATE - 1/24
AND session_state = 'WAITING' AND wait_class <> 'Idle'
GROUP BY event, wait_class ORDER BY samples DESC FETCH FIRST 10 ROWS ONLY;
PROMPT --- Top SQL by ASH (last hour) ---
SELECT sql_id, COUNT(*) AS ash_samples
FROM gv$active_session_history
WHERE sample_time > SYSDATE - 1/24 AND sql_id IS NOT NULL
GROUP BY sql_id ORDER BY ash_samples DESC FETCH FIRST 10 ROWS ONLY;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: BLOCKING SESSIONS ===
PROMPT ============================================================
SELECT a.inst_id blocker_inst, a.sid blocker_sid,
a.username blocker_user, a.event blocker_event,
b.inst_id waiter_inst, b.sid waiter_sid,
b.username waiter_user, b.event waiter_event, b.seconds_in_wait
FROM gv$session a
JOIN gv$session b ON a.sid=b.final_blocking_session
AND a.inst_id=b.final_blocking_instance
WHERE b.final_blocking_session IS NOT NULL
ORDER BY b.seconds_in_wait DESC;
SELECT l.inst_id, l.sid, s.username, l.type, l.lmode, l.request, l.block, l.ctime
FROM gv$lock l JOIN gv$session s ON l.sid=s.sid AND l.inst_id=s.inst_id
WHERE l.block > 0 AND l.type IN ('TX','TM')
ORDER BY l.inst_id, l.sid;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: MEMORY & I/O ===
PROMPT ============================================================
SELECT inst_id, name, ROUND(bytes/1024/1024) size_mb, resizeable
FROM gv$sgainfo ORDER BY inst_id, name;
SELECT inst_id,
ROUND(SUM(pga_alloc_mem)/1024/1024/1024,2) total_pga_gb,
ROUND(SUM(pga_used_mem)/1024/1024/1024,2) used_pga_gb
FROM gv$process GROUP BY inst_id ORDER BY inst_id;
PROMPT === Tablespace & FRA Space ===
SELECT tablespace_name, ROUND(SUM(bytes)/1024/1024/1024,2) total_gb,
ROUND(SUM(maxbytes)/1024/1024/1024,2) max_gb
FROM dba_data_files GROUP BY tablespace_name ORDER BY total_gb DESC;
SELECT ROUND(SPACE_LIMIT/1024/1024/1024,2) fra_total_gb,
ROUND(SPACE_USED/1024/1024/1024,2) fra_used_gb,
ROUND(SPACE_USED/SPACE_LIMIT*100,2) fra_pct
FROM v$recovery_file_dest;
PROMPT === Data Guard ===
SELECT * FROM v$dataguard_stats;
PROMPT === SQL Monitor (Top 10) ===
SELECT inst_id, sql_id, sql_exec_id, status,
ROUND(elapsed_time/1e6,1) elapsed_sec,
ROUND(cpu_time/1e6,1) cpu_sec,
ROUND(user_io_wait_time/1e6,1) io_wait_sec,
ROUND(temp_space_allocated/1024/1024,1) temp_mb
FROM gv$sql_monitor
WHERE status IN ('EXECUTING','DONE') AND (cpu_time+user_io_wait_time) > 0
ORDER BY elapsed_time DESC FETCH FIRST 10 ROWS ONLY;
PROMPT === Recent Alert Log Errors ===
SELECT TO_CHAR(originating_timestamp,'YYYY-MM-DD HH24:MI') alert_time, message_text
FROM v$diag_alert_ext
WHERE (message_text LIKE '%ORA-%' OR message_text LIKE '%error%'
OR message_text LIKE '%fail%' OR message_text LIKE '%cell%'
OR message_text LIKE '%offload%' OR message_text LIKE '%flash%')
AND originating_timestamp > SYSDATE - 1 AND ROWNUM <= 50
ORDER BY originating_timestamp DESC;
PROMPT
PROMPT ============================================================
PROMPT === COPILOT_AUTO_INSTRUCT: DISK I/O DISTRIBUTION ===
PROMPT ============================================================
SELECT file_name, tablespace_name,
ROUND(bytes/1024/1024/1024,2) size_gb
FROM dba_data_files ORDER BY bytes DESC FETCH FIRST 20 ROWS ONLY;
PROMPT
PROMPT ============================================================
PROMPT === END EXADATA DATABASE METRICS ===
PROMPT ============================================================
EOSQL
$SQLPLUS -S / as sysdba @"${REPORT_DIR}/exadata_rac_metrics.sql" > "${SQL_OUTPUT}" 2>&1
RC=$?
if [ $RC -ne 0 ]; then
echo "WARNING: SQL*Plus exited with code $RC. Output may be incomplete."
fi
cat "${SQL_OUTPUT}" >> "${EVIDENCE_FILE}"
# ================================================================
# SECTION 6 : ExaWatcher Recent Data (if available)
# ================================================================
echo ""
echo "========== EXAWATCHER RECENT DATA =========="
if [ -d "${EXAWATCHER_HOME}" ]; then
LATEST_IOSTAT=$(find "${EXAWATCHER_HOME}/archive" -name "*iostat*" -type f -mmin -120 2>/dev/null | tail -1)
LATEST_CELLSRV=$(find "${EXAWATCHER_HOME}/archive" -name "*cellsrvstat*" -type f -mmin -120 2>/dev/null | tail -1)
if [ -n "${LATEST_IOSTAT}" ]; then
echo "--- Last 30 lines of ExaWatcher iostat ---"
tail -30 "${LATEST_IOSTAT}"
fi
if [ -n "${LATEST_CELLSRV}" ]; then
echo "--- Last 30 lines of ExaWatcher cellsrvstat ---"
tail -30 "${LATEST_CELLSRV}"
fi
if [ -z "${LATEST_IOSTAT}" ] && [ -z "${LATEST_CELLSRV}" ]; then
echo "No recent ExaWatcher data found (last 2 hours). ExaWatcher may not be running."
fi
else
echo "ExaWatcher not found at ${EXAWATCHER_HOME}."
echo "ExaWatcher should be running by default on all Exadata nodes."
fi
# ================================================================
# SECTION 7 : EXAchk Reference (if available)
# ================================================================
echo ""
echo "========== EXAchk REFERENCE =========="
if [ -f "${EXACHK_HOME}/exachk" ]; then
echo "EXAchk found at ${EXACHK_HOME}/exachk"
echo "Consider running: ${EXACHK_HOME}/exachk -profile asm,db,cell"
echo "EXAchk provides comprehensive compliance and best-practice checks beyond this script."
else
echo "EXAchk not found at ${EXACHK_HOME}. Download from MOS Doc ID 1070954.1."
fi
# ================================================================
# SECTION 8 : Build Copilot Prompt
# ================================================================
echo ""
echo "========== BUILDING COPILOT PROMPT =========="
cat > "${PROMPT_FILE}" << 'EOPROMPT'
# Role
You are an expert **Oracle Exadata RAC Performance DBA**. You triage incidents using evidence from OS, InfiniBand fabric, CellCLI, and Exadata-specific database diagnostics. You provide a **confidence score (0–100%)** for every finding with clear rationale.
# Context
- **ServiceNow Ticket**:
- **Environment**: Oracle 19c+ RAC on Oracle Exadata Database Machine (X5–X10M).
- **Evidence Source**: The attached log file contains comprehensive OS + IB fabric + CellCLI + database metrics collected simultaneously from all nodes and cells.
# Task
Analyze the **ENTIRE attached evidence file** from scratch. Do NOT rely on prior conversation. Produce:
## A) Executive Summary
2–3 sentences describing the overall health (GREEN / YELLOW / RED) and the single most critical finding.
## B) Full On-Call DBA Report for ServiceNow
For each area below, provide findings, a severity verdict, and a **confidence score**:
1. **Platform & Version** – Exadata model, image version, cellsrv version, DB version. Mismatched versions are YELLOW/RED.
2. **InfiniBand Fabric Health** – ibstat link state, ibqueryerrors, UDP buffer sizes. Any link errors or down links are RED.
3. **Cell Health** – V$CELL cell_state (all must be ONLINE), V$CELL_STATE thread pool, cell CPU/memory utilization. OFFLINE/ERROR cells = RED.
4. **Smart Scan / Cell Offload Efficiency** – Offload efficiency % per instance. <40%: RED. Cell offload must be above 80% for scan-heavy workloads.
5. **Storage Index Effectiveness** – Bytes saved by storage index vs total eligible bytes. Zero savings on scan workloads: YELLOW.
6. **Flash Cache** – Flash cache hit ratio, flash cache read hits vs total reads. <70%: YELLOW. Check cell-side FC_BY_USED and FC_IO_BY_R for flash capacity pressure.
7. **RAC Global Cache (Cache Fusion)** – GC wait events. gc block lost > 0: RED. 3-way > 5% of 2-way: YELLOW. Check interconnect latency.
8. **CPU Saturation** – DB_Time vs DB_CPU per instance. Note: On Exadata, Smart Scan offloads CPU to cells, so DB CPU may appear low even during heavy I/O.
9. **IORM (I/O Resource Manager)** – IORM plan, objective, inter-database allocations. If IORM is limiting a database under load, flag as YELLOW.
10. **Process & Session Capacity** – Per-instance. >85%: RED.
11. **Blocking & Locking** – Cross-instance blocking. TX/TM locks.
12. **Memory Stability** – SGA/PGA sizing, no ORA-4031.
13. **Space** – Tablespace >90%, FRA >85%: RED.
14. **Top SQL** – ASH samples, SQL Monitor, temp spill.
15. **Alert Log** – ORA- errors, cell errors, offload errors, flash errors.
16. **Data Guard** – Transport/apply lag (if applicable).
## C) Numeric Summary
Provide a concise table with:
- Exadata Model / Image Version
- DB Version | Instances: N | RAC: Yes | Cells: N (all ONLINE? Y/N)
- Offload Efficiency % (per instance)
- Flash Cache Hit Ratio % (per instance)
- Storage Index Bytes Saved (cluster total)
- Active Processes (per node / cluster total) | Max Processes
- CPU Wait (sec) per instance
- GC Block Lost count (cluster total; 0 = GREEN)
- Blocking Sessions count
- FRA % Used
- Top Wait Event (name + avg ms)
- ORA- Errors in last 24h
## D) Actionable Remediation
List the top 3 immediate actions, ordered by urgency.
# Exadata-Specific Interpretation Rules (MANDATORY)
- **Cell Offload Efficiency** = 1 − (cell physical IO interconnect bytes returned by smart scan / cell physical IO interconnect bytes). A value below 40% on scan-heavy workloads means Smart Scan is not working. Causes: uncommitted data, small segments (< 128 MB default), shared server sessions, cell memory pressure, or `cell_offload_processing=false`.
- **cell smart table scan / cell smart index scan**: These wait events confirm Smart Scan is active. GREEN.
- **cell multiblock physical read**: Reads into buffer cache bypassing Smart Scan. If this dominates over smart scan events during scans → YELLOW/RED.
- **cell single block physical read**: OLTP single-block I/O. Normal. High latency (>2ms) on flash: YELLOW; >5ms on disk: YELLOW.
- **Storage Indexes**: `cell physical IO bytes saved by storage index` quantifies eliminated I/O. Check `physical read requests optimized` for combined flash + storage index effect.
- **Flash Cache**: `cell flash cache read hits` from v$sysstat. Cross-reference with cell-side `FC_IO_BY_R` and `FC_IO_BY_R_MISS` for capacity planning. Zero hits: flash cache may be disabled or bypassed.
- **gc block lost**: RED always. Indicates InfiniBand packet loss. Check UDP buffer sizes (`net.core.rmem_max`), MTU, IB link errors.
- **On Exadata, low DB CPU is NOT necessarily a problem**: Smart Scan offloads filtering to cell CPUs. High `db_time` with moderate `db_cpu` during scans is normal.
- **PX Deq:* waits**: Parallel coordination, NOT blocking. Do NOT escalate.
- **resmgr:cpu quantum**: Expected throttling under Resource Manager.
- **IORM**: If `iormplan objective = auto` and `utilization_limit` is constraining a database, `DB_CPU` wait may appear at database level.
- **V$CELL_THREAD_HISTORY**: The "ASH for Exadata Cells." Thread states of "msg lock" or "msg wait" indicate cell coordination delays. "busy" > 10% means cells are under load.
# Confidence Score Guidance
- **90–100%**: Very clear evidence; all metrics present and consistent.
- **70–89%**: Generally clear, but some inference required or one data source missing.
- **50–69%**: Partial evidence; key metrics missing or ambiguous (e.g., cell metrics unavailable).
- **<50%**: Low confidence; critical data unavailable (e.g., no dcli access, no cell metrics).
# Output Format
Use a clean, structured Markdown format. Include **severity badges** (🟢 GREEN / 🟡 YELLOW / 🔴 RED) and **confidence scores** for each finding.
EOPROMPT
sed -i "s//${TICKET}/g" "${PROMPT_FILE}"
echo ""
echo "============================================================"
echo "=== COLLECTION COMPLETE ==="
echo "============================================================"
echo "Evidence file : ${EVIDENCE_FILE}"
echo "Copilot prompt: ${PROMPT_FILE}"
echo ""
echo "=== NEXT STEPS ==="
echo "1) Open a NEW Copilot session (clear prior context)."
echo "2) Copy-paste the contents of: ${PROMPT_FILE}"
echo "3) Then drag-and-drop the evidence file: ${EVIDENCE_FILE}"
echo "4) Say: 'Analyze the attached evidence file using the instructions above.'"
echo "5) Once analysis is complete: 'Create a downloadable .txt artifact with the full report.'"
echo "============================================================"