CEPH-MIB DEFINITIONS ::= BEGIN

IMPORTS
    MODULE-IDENTITY, NOTIFICATION-TYPE, enterprises
        FROM SNMPv2-SMI
    MODULE-COMPLIANCE, NOTIFICATION-GROUP
        FROM SNMPv2-CONF
;

-- Linting information:
--
-- # smilint -l 6 -i notification-not-reversible ./CEPH-MIB.txt
--
-- ignore: notification-not-reversible since our SNMP gateway doesn't use SNMPv1
--

-- Module identity for the Ceph MIB. Registers the module at
-- { enterprises 50495 } and records maintainer contact details plus the
-- revision history. LAST-UPDATED and the newest REVISION clause carry the
-- same timestamp; keep them in step when a new revision is added.
ceph MODULE-IDENTITY
    LAST-UPDATED
        "202111010000Z" -- Nov 01, 2021
    ORGANIZATION
        "The Ceph Project
         https://ceph.io"
    CONTACT-INFO
        "Email: <dev@ceph.io>

        Send comments to: <dev@ceph.io>"
    DESCRIPTION
        "The MIB module for Ceph. In its current form it only
        supports Notifications, since Ceph itself doesn't provide
        any SNMP agent functionality.

        Notifications are provided through a Prometheus/Alertmanager
        webhook passing alerts to an external gateway service that is
        responsible for formatting, forwarding and authenticating to
        the SNMP receiver.
        "
    REVISION
        "202111010000Z" --Nov 01, 2021
    DESCRIPTION
        "Latest version including the following updates;

        - MIB restructure to align with linting
        - names shortened and simplified (less verbose)
        - Simplified structure due to switch to https://github.com/maxwo/snmp_notifier
          - objects removed
          - notifications updated
        - Added module compliance
        - Updated to latest prometheus alert rule definitions
        "
    ::= { enterprises 50495 }

-- Top-level subtrees directly beneath the ceph module identity.
cephCluster OBJECT IDENTIFIER ::= { ceph 1 }
cephConformance OBJECT IDENTIFIER ::= { ceph 2 }

-- cephMetadata is reserved for possible future expansion through an agent
-- that could expose an overview of the cluster's configuration.
cephMetadata OBJECT IDENTIFIER ::= { cephCluster 1 }

-- Root of the notification subtree.
cephNotifications OBJECT IDENTIFIER ::= { cephCluster 2 }

-- Notifications driven by Prometheus alert rules hang off this node.
prometheus OBJECT IDENTIFIER ::= { cephNotifications 1 }

--
-- Notification branches: one sub-identifier beneath 'prometheus' for each
-- category of notification / alert.
--
-- NOTE(review): sub-identifier 13 is not assigned anywhere in this file;
-- presumably it is reserved, so confirm before reusing it.
promGeneric OBJECT IDENTIFIER ::= { prometheus 1 }
promHealthStatus OBJECT IDENTIFIER ::= { prometheus 2 }
promMon OBJECT IDENTIFIER ::= { prometheus 3 }
promOsd OBJECT IDENTIFIER ::= { prometheus 4 }
promMds OBJECT IDENTIFIER ::= { prometheus 5 }
promMgr OBJECT IDENTIFIER ::= { prometheus 6 }
promPGs OBJECT IDENTIFIER ::= { prometheus 7 }
promNode OBJECT IDENTIFIER ::= { prometheus 8 }
promPool OBJECT IDENTIFIER ::= { prometheus 9 }
promRados OBJECT IDENTIFIER ::= { prometheus 10 }
promCephadm OBJECT IDENTIFIER ::= { prometheus 11 }
promPrometheus OBJECT IDENTIFIER ::= { prometheus 12 }
promNVMeGateway OBJECT IDENTIFIER ::= { prometheus 14 }

-- promGeneric branch: the fallback alert and the daemon crash alert.
promGenericNotification NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Generic alert issued when the Prometheus rule doesn't provide an OID."
    ::= { promGeneric 1 }

promGenericDaemonCrash NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "One or more daemons have crashed recently, and are yet to be archived"
    ::= { promGeneric 2 }

-- promHealthStatus branch: cluster health stuck in an error or warning state.
promHealthStatusError NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Ceph in health_error state for too long."
    ::= { promHealthStatus 1 }

promHealthStatusWarning NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Ceph in health_warn for too long."
    ::= { promHealthStatus 2 }

-- promMon branch: monitor quorum and monitor disk space alerts.
promMonLowQuorum NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Monitor count in quorum is low."
    ::= { promMon 1 }

promMonDiskSpaceCritical NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Monitor diskspace is critically low."
    ::= { promMon 2 }

-- promOsd branch: OSD availability, capacity, stability and placement alerts.
promOsdDownHigh NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "A high number of OSDs are down."
    ::= { promOsd 1 }

promOsdDown NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "One or more OSDs down."
    ::= { promOsd 2 }

promOsdNearFull NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "An OSD is dangerously full."
    ::= { promOsd 3 }

promOsdFlapping NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "An OSD was marked down and back up at least once a minute for 5 minutes."
    ::= { promOsd 4 }

promOsdHighPgDeviation NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "An OSD deviates by more than 30% from average PG count."
    ::= { promOsd 5 }

promOsdFull NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "An OSD has reached its full threshold."
    ::= { promOsd 6 }

promOsdHighPredictedFailures NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Normal self healing unable to cope with the number of devices predicted to fail."
    ::= { promOsd 7 }

promOsdHostDown NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Ceph OSD host is down."
    ::= { promOsd 8 }

-- promMds branch: CephFS filesystem state and MDS standby alerts.
promMdsDamaged NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Cephfs filesystem is damaged."
    ::= { promMds 1 }

promMdsReadOnly NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Cephfs filesystem marked as READ-ONLY"
    ::= { promMds 2 }

promMdsOffline NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Cephfs filesystem is unavailable/offline."
    ::= { promMds 3 }

promMdsDegraded NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Cephfs filesystem is in a degraded state."
    ::= { promMds 4 }

promMdsNoStandby NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Cephfs MDS daemon failure, no standby available"
    ::= { promMds 5 }

-- promMgr branch: manager module crash and prometheus module liveness alerts.
promMgrModuleCrash NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Ceph mgr module has crashed recently"
    ::= { promMgr 1 }

promMgrPrometheusInactive NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Ceph mgr prometheus module not responding"
    ::= { promMgr 2 }

-- promPGs branch: placement group state, damage, recovery and backfill alerts.
promPGsInactive NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "One or more PGs are inactive for more than 5 minutes."
    ::= { promPGs 1 }

promPGsUnclean NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "One or more PGs are not clean for more than 15 minutes."
    ::= { promPGs 2 }

promPGsUnavailable NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "One or more PGs is unavailable, blocking I/O to those objects."
    ::= { promPGs 3 }

promPGsDamaged NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "One or more PGs is damaged."
    ::= { promPGs 4 }

promPGsRecoveryFull NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "PG recovery is impaired due to full OSDs."
    ::= { promPGs 5 }

promPGsBackfillFull NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "PG backfill is impaired due to full OSDs."
    ::= { promPGs 6 }

-- promNode branch: host level storage and network interface alerts.
promNodeRootVolumeFull NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Root volume (OSD and MON store) is dangerously full (< 5% free)."
    ::= { promNode 1 }

promNodeNetworkPacketDrops NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "A node experiences packet drop > 1 packet/s on an interface."
    ::= { promNode 2 }

promNodeNetworkPacketErrors NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "A node experiences packet errors > 1 packet/s on an interface."
    ::= { promNode 3 }

promNodeStorageFilling NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "A mountpoint will be full in less than 5 days assuming the average fillup rate of the past 48 hours."
    ::= { promNode 4 }

-- promPool branch: pool capacity and fill rate alerts.
promPoolFull NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "A pool is at 90% capacity or over."
    ::= { promPool 1 }

promPoolFilling NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "A pool will be full in less than 5 days assuming the average fillup rate of the past 48 hours."
    ::= { promPool 2 }

-- promRados branch: unfound RADOS objects and RBD mirroring alerts.
promRadosUnfound NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "A RADOS object can not be found, even though all OSDs are online."
    ::= { promRados 1 }

promRadosRBDMirrorImagesVeryHigh NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Number of RBD image replications are very high."
    ::= { promRados 2 }

promRadosRBDMirrorUnsyncImages NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Local RBD images are not in sync with the remote counterparts"
    ::= { promRados 3 }

promRadosRBDMirrorUnsyncImagesHigh NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "There is a high percentage of un-sync RBD images."
    ::= { promRados 4 }

promRadosRBDMirrorHighBandwidth NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "A high bandwidth usage is detected during RBD image transfers."
    ::= { promRados 5 }

-- promCephadm branch: daemon down and upgrade failure alerts from cephadm.
promCephadmDaemonDown NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Cephadm has determined that a daemon is down."
    ::= { promCephadm 1 }

promCephadmUpgradeFailure NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "Cephadm attempted to upgrade the cluster and encountered a problem."
    ::= { promCephadm 2 }

-- promPrometheus branch: the scrape job definition is missing.
promPrometheusJobMissing NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "The prometheus scrape job is not defined."
    ::= { promPrometheus 1 }

-- promNVMeGateway branch: NIC failure on the NVMe gateway client path.
promNVMeGatewayNicDown NOTIFICATION-TYPE
    STATUS current
    DESCRIPTION
        "A NIC used for NVMe gateway client traffic is down."
    ::= { promNVMeGateway 1 }
-- ---------------------------------------------------------- --
-- Ceph MIB - Conformance Information
-- ---------------------------------------------------------- --

-- Conformance subtree: one node for group definitions and one for
-- compliance statements.
cephAlertGroups OBJECT IDENTIFIER ::= { cephConformance 1 }
cephCompliances OBJECT IDENTIFIER ::= { cephConformance 2 }

-- ---------------------------------------------------------- --
-- units of conformance
-- ---------------------------------------------------------- --

-- ---------------------------------------------------------- --
-- The Trap Notification Group
-- ---------------------------------------------------------- --

-- Single notification group listing every NOTIFICATION-TYPE defined in
-- this module, ordered by branch. A newly added notification must also be
-- added here, since cephCompliance names this group as mandatory.
cephNotificationGroup NOTIFICATION-GROUP
    NOTIFICATIONS {
        -- promGeneric
        promGenericNotification,
        promGenericDaemonCrash,
        -- promHealthStatus
        promHealthStatusError,
        promHealthStatusWarning,
        -- promMon
        promMonLowQuorum,
        promMonDiskSpaceCritical,
        -- promOsd
        promOsdDownHigh,
        promOsdDown,
        promOsdNearFull,
        promOsdFlapping,
        promOsdHighPgDeviation,
        promOsdFull,
        promOsdHighPredictedFailures,
        promOsdHostDown,
        -- promMds
        promMdsDamaged,
        promMdsReadOnly,
        promMdsOffline,
        promMdsDegraded,
        promMdsNoStandby,
        -- promMgr
        promMgrModuleCrash,
        promMgrPrometheusInactive,
        -- promPGs
        promPGsInactive,
        promPGsUnclean,
        promPGsUnavailable,
        promPGsDamaged,
        promPGsRecoveryFull,
        promPGsBackfillFull,
        -- promNode
        promNodeRootVolumeFull,
        promNodeNetworkPacketDrops,
        promNodeNetworkPacketErrors,
        promNodeStorageFilling,
        -- promPool
        promPoolFull,
        promPoolFilling,
        -- promRados
        promRadosUnfound,
        promRadosRBDMirrorImagesVeryHigh,
        promRadosRBDMirrorUnsyncImages,
        promRadosRBDMirrorUnsyncImagesHigh,
        promRadosRBDMirrorHighBandwidth,
        -- promCephadm
        promCephadmDaemonDown,
        promCephadmUpgradeFailure,
        -- promPrometheus
        promPrometheusJobMissing,
        -- promNVMeGateway
        promNVMeGatewayNicDown
    }
    STATUS  current
    DESCRIPTION
        "A collection of notifications triggered by the Prometheus
        rules to convey Ceph cluster state"
    ::= { cephAlertGroups 2 }

-- ---------------------------------------------------------- --
-- compliance statements
-- ---------------------------------------------------------- --

-- Compliance statement for this module: cephNotificationGroup is the only
-- mandatory group.
cephCompliance MODULE-COMPLIANCE
    STATUS  current
    DESCRIPTION
        "The Compliance statement for the Ceph MIB"
    MODULE  -- this module
        MANDATORY-GROUPS { cephNotificationGroup }
    ::= { cephCompliances 1 }

END
