2013年4月17日
 

……

在监控主机端安装check_nrpe插件

[root@monitor ~]# tar xzvf nrpe-2.14.tar.gz
[root@monitor ~]# cd nrpe-2.14
[root@monitor nrpe-2.14]# ./configure
[root@monitor nrpe-2.14]# make all
[root@monitor nrpe-2.14]# make install-plugin
[root@monitor nrpe-2.14]# ls /usr/local/nagios/libexec/check_nrpe
 /usr/local/nagios/libexec/check_nrpe
[root@monitor nrpe-2.14]#

在被监控主机端安装nagios plugin与nrpe daemon

[root@localhost nrpe-2.14]# useradd nagios -s /sbin/nologin

安装nagios plugin

[root@localhost nagios-plugins-1.4.16]# ./configure
[root@localhost nagios-plugins-1.4.16]# make
[root@localhost nagios-plugins-1.4.16]# make install
[root@localhost nagios-plugins-1.4.16]# chown -R nagios.nagios /usr/local/nagios/

[root@localhost nrpe-2.14]# yum install gcc make openssl-devel

[root@localhost ~]# yum install xinetd

[root@localhost nrpe-2.14]# ./configure
[root@localhost nrpe-2.14]# make
[root@localhost nrpe-2.14]# make install
[root@localhost nrpe-2.14]# make install-daemon
[root@localhost nrpe-2.14]# make install-daemon-config
[root@localhost nrpe-2.14]# make install-xinetd

修改xinetd配置文件中允许访问NRPE的主机IP地址(only_from,多个地址以空格分隔;需加入监控主机的IP地址,否则监控主机无法连接)
[root@localhost nrpe-2.14]# vi /etc/xinetd.d/nrpe
only_from       = 127.0.0.1

[root@localhost nrpe-2.14]# vi /etc/services
nrpe            5666/tcp                # NRPE Daemon

[root@localhost nrpe-2.14]# service xinetd start
Starting xinetd:                                           [  OK  ]
[root@localhost nrpe-2.14]#

[root@localhost nrpe-2.14]# netstat -at |grep nrpe
tcp        0      0 *:nrpe                      *:*                         LISTEN
[root@localhost nrpe-2.14]#

[root@localhost nrpe-2.14]# /usr/local/nagios/libexec/check_nrpe -H localhost
NRPE v2.14
[root@localhost nrpe-2.14]#

在监控主机使用命令检测被监控主机当前登录用户数量

[root@monitor libexec]# ./check_nrpe -H 192.168.1.90 -c check_users
USERS OK - 1 users currently logged in |users=1;5;10;0
[root@monitor libexec]#

使用NRPE需要定义被监控主机所使用的命令
[root@localhost nrpe-2.14]# vi /usr/local/nagios/etc/nrpe.cfg
command[check_users]=/usr/local/nagios/libexec/check_users -w 5 -c 10
command[check_load]=/usr/local/nagios/libexec/check_load -w 15,10,5 -c 30,25,20
command[check_hda1]=/usr/local/nagios/libexec/check_disk -w 20% -c 10% -p /dev/hda1
command[check_zombie_procs]=/usr/local/nagios/libexec/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200

在监控主机创建check_nrpe命令定义

# 'check_nrpe' command definition
define command{
command_name    check_nrpe
command_line    $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}

增加被监控主机的监控服务定义

[root@monitor etc]# vi servers/szvs-v01.cfg
define service{
use                             generic-service
host_name                       szvs-v01
service_description             Current Users
check_command                   check_nrpe!check_users
}

define service{
use                             generic-service
host_name                       szvs-v01
service_description             CPU Load
check_command                   check_nrpe!check_load
}

nagios-nrpe-install-01 nagios-nrpe-install-02

相关下载:
(1) NRPE 2.14

2013年4月17日
 

确定将要使用的、已在templates.cfg中定义的主机模板

在主配置文件中取消注释以下内容

 #cfg_dir=/usr/local/nagios/etc/servers

创建目录

[root@monitor objects]# mkdir ../servers
 [root@monitor objects]# chown -R nagios.nagios ../servers/
 [root@monitor objects]# chmod -R 775 ../servers/

创建主机组配置文件szvs.cfg并添加主机

define hostgroup{
 hostgroup_name  v-servers
 alias           Virtualization Servers
 members         szvs-v01
 }

创建主机配置文件szvs-v01.cfg

定义主机部分,使用linux-server主机模板

define host{
 use                     linux-server
 host_name               szvs-v01
 alias                   app
 address                 192.168.1.90
 }

定义服务部分

引用的服务模板generic-service来自模板配置文件

define service{
 use                             generic-service
 host_name                       szvs-v01
 service_description             PING
 check_command                   check_ping!100.0,20%!500.0,60%
 notifications_enabled           0
 }
define service{
 use                             generic-service
 host_name                       szvs-v01
 service_description             HTTP
 check_command                   check_http
 notifications_enabled           0
 }

检测当前配置文件正确性

[root@monitor objects]# ../../bin/nagios -v ../nagios.cfg

Nagios Core 3.5.0
Copyright (c) 2009-2011 Nagios Core Development Team and Community Contributors
Copyright (c) 1999-2009 Ethan Galstad
Last Modified: 03-15-2013
License: GPL

Website: http://www.nagios.org
Reading configuration data...
Read main config file okay...
Processing object config file '/usr/local/nagios/etc/objects/commands.cfg'...
Processing object config file '/usr/local/nagios/etc/objects/contacts.cfg'...
Processing object config file '/usr/local/nagios/etc/objects/timeperiods.cfg'...
Processing object config file '/usr/local/nagios/etc/objects/templates.cfg'...
Processing object config file '/usr/local/nagios/etc/objects/localhost.cfg'...
Processing object config directory '/usr/local/nagios/etc/servers'...
Processing object config file '/usr/local/nagios/etc/servers/szvs.cfg'...
Processing object config file '/usr/local/nagios/etc/servers/szvs-v01.cfg'...
Read object config files okay...

Running pre-flight check on configuration data...

Checking services...
Checked 9 services.
Checking hosts...
Checked 2 hosts.
Checking host groups...
Checked 2 host groups.
Checking service groups...
Checked 0 service groups.
Checking contacts...
Checked 1 contacts.
Checking contact groups...
Checked 1 contact groups.
Checking service escalations...
Checked 0 service escalations.
Checking service dependencies...
Checked 0 service dependencies.
Checking host escalations...
Checked 0 host escalations.
Checking host dependencies...
Checked 0 host dependencies.
Checking commands...
Checked 24 commands.
Checking time periods...
Checked 5 time periods.
Checking for circular paths between hosts...
Checking for circular host and service dependencies...
Checking global event handlers...
Checking obsessive compulsive processor commands...
Checking misc settings...

Total Warnings: 0
Total Errors:   0

Things look okay - No serious problems were detected during the pre-flight check
[root@monitor objects]#

重启nagios服务

[root@monitor objects]# service nagios restart
 Running configuration check...done.
 Stopping nagios: .done.
 Starting nagios:This account is currently not available.
 done.
 [root@monitor objects]#

查看重启以后的nagios日志信息

nagios-config-01-01

 

查看主机地图

nagios-config-01-02

查看主机列表

nagios-config-01-03

查看服务列表

nagios-config-01-04

 

 

查看主机组列表

nagios-config-01-05

检测项目的任务队列

首次添加的任务显示为PENDING状态,等待检测

nagios-config-01-06

查看任务队列中的检测时间排列

nagios-config-01-07

成功执行新添加的HTTP服务检测

nagios-config-01-08

2013年4月17日
 

……

nagios-monitor-01

主配置文件nagios.cfg默认引用的对象配置文件

cfg_file=/usr/local/nagios/etc/objects/commands.cfg
cfg_file=/usr/local/nagios/etc/objects/contacts.cfg
cfg_file=/usr/local/nagios/etc/objects/timeperiods.cfg
cfg_file=/usr/local/nagios/etc/objects/templates.cfg
cfg_file=/usr/local/nagios/etc/objects/localhost.cfg

Localhost监控对象配置文件分析

定义主机

define host{
        use                     linux-server
        host_name               localhost
        alias                   localhost
        address                 127.0.0.1
        }

use 定义当前主机使用的主机模板,引用在templates.cfg中已定义的linux-server主机模板
host_name 定义显示在nagios web管理界面中的主机名称
alias 别名,主机名的完整描述
address 定义当前主机IP地址

定义主机组

 define hostgroup{
 hostgroup_name  linux-servers
 alias           Linux Servers
 members         localhost
 }

主机组用来定义相似服务类型或处于同一地域的一组主机
hostgroup_name 定义当前主机组名称
alias 主机组别名,主机组名称的完整描述
members 定义当前主机组中包含的主机,使用已定义主机名称并使用逗号分隔

定义服务(定义具体的监控项目)

监控已定义主机localhost的ping响应
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             PING
check_command                   check_ping!100.0,20%!500.0,60%
}

use 引用在templates.cfg中已定义的服务模板
host_name 指定启用此监控项目的已定义的主机名
service_description 显示在nagios web界面的服务名称
check_command 指定检测此服务所使用的命令名称及其参数,命令名称与各参数之间使用!分隔

配置文件中检测命令的完整工作过程

查看检测插件check_ping所在位置及其用法

[root@monitor objects]# pwd
 /usr/local/nagios/etc/objects
[root@monitor objects]# ls ../../libexec/check_ping
 ../../libexec/check_ping
[root@monitor objects]#
[root@monitor objects]# ../../libexec/check_ping
 check_ping: Could not parse arguments
 Usage:
 check_ping -H <host_address> -w <wrta>,<wpl>% -c <crta>,<cpl>%
[-p packets] [-t timeout] [-4|-6]
[root@monitor objects]#

-w 指定警告数值和百分比
-c 指定临界数值和百分比

nagios检测命令的完整格式及返回结果

[root@monitor objects]# ../../libexec/check_ping -H localhost -w 100.0,20% -c 500.0,60%
 PING OK - Packet loss = 0%, RTA = 0.05 ms|rta=0.046000ms;100.000000;500.000000;0.000000 pl=0%;20;60;0
 [root@monitor objects]#

响应时间达到或超过100毫秒或丢包率达到20%进入警告状态,响应时间达到或超过500毫秒或丢包率达到60%进入临界状态
检测得到的实际响应时间为0.05毫秒

监控已定义主机localhost的根分区(/)可用磁盘容量

define service{
 use                             local-service         ; Name of service template to use
 host_name                       localhost
 service_description             Root Partition
 check_command                   check_local_disk!20%!10%!/
 }

check_command中的check_local_disk实际引用的是命令定义文件commands.cfg中已定义的命令名称
# 'check_local_disk' command definition
define command{
command_name    check_local_disk
command_line    $USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$
}

nagios检测命令的完整格式及返回结果
[root@monitor libexec]# ./check_disk
check_disk: Could not parse arguments
Usage:
check_disk -w limit -c limit [-W limit] [-K limit] {-p path | -x device}
[-C] [-E] [-e] [-g group ] [-k] [-l] [-M] [-m] [-R path ] [-r path ]
[-t timeout] [-u unit] [-v] [-X type]
[root@monitor libexec]# ./check_disk -w 20% -c 10% -p /
DISK OK - free space: / 45970 MB (96% inode=98%);| /=1866MB;40316;45356;0;50396
[root@monitor libexec]#

监控已定义主机localhost的当前登录用户数量
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             Current Users
check_command                   check_local_users!20!50
}

监控已定义主机localhost的进程数量
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             Total Processes
check_command                   check_local_procs!250!400!RSZDT
}

监控已定义主机localhost的负载状态
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             Current Load
check_command                   check_local_load!5.0,4.0,3.0!10.0,6.0,4.0
}

监控已定义主机localhost的磁盘交换空间使用状态
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             Swap Usage
check_command                   check_local_swap!20!10
}

监控已定义主机localhost的ssh服务或端口开启状态
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             SSH
check_command                   check_ssh
notifications_enabled           0
}

监控已定义主机localhost的web服务或80端口状态
define service{
use                             local-service         ; Name of service template to use
host_name                       localhost
service_description             HTTP
check_command                   check_http
notifications_enabled           0
}