%CODE{"sh" num="1"}% # SCRIPT: Cluster Monitoring Script # Date..: Mon 20 Oct 2008 02:48:13 PM BRST # Author: Jadir Marra da Silva<jadir.silva13@gmail.com> # ##################################################### # Jadir Silva: # Mon 02 Nov 2008 # + acrescentado o teste para verificar o espaco no /scratch # do nodes conforme sugerido por Sergio Lietti.

# Jadir Silva: # Mon 03 Nov 2008 09:47:20 AM BRST # + acrescentado uma definição condicional para MAILTO, # MAILSUBJECT e CARBON_COPY para evitar o envio de emails # quando estiver depurando o script

# Jadir Silva: # Mon 10 Nov 2008 11:21:26 AM BRST # + acrescentado alerta via email para o site verify,

# Jadir Silva: # Tue 11 Nov 2008 09:09:13 AM BRST # + acrescentado comando para remover o arquivo temporario com # o conteudo do email de alerta.

# Jadir Silva: # Tue 11 Nov 2008 12:43:04 PM BRST # + alterado o IDLE_THRESHOLD de 699 para 999 # seguindo orientacao do Lietti.

# Jadir Silva: # Thu 27 Nov 2008 10:22:34 AM BRST # + correcao de pequeno problema que impedia # o envio de email com numero de nodes com o condor down.

# Site identity and e-mail settings used by the report/alert functions.
TIER="T2_BR_SPRACE"
SEND_MAIL="N"
GRID_USER="mdias"
# Temporary file in which the HTML report is assembled.
MAIL_BODY=$(mktemp /tmp/site_verify.XXXXXXX)
MAILFROM="root@osg-ce.sprace.org.br"
MAILTO="sprace_ops@googlegroups.com"
CARBON_COPY="jadir.silva13@gmail.com"
# The subject carries the time at which the script started.
MAILSUBJECT="SPRACE - Monitoramento Automatico - $(date)"

# Monitoring configuration.
# Each assignment sits on its own line: with everything on one line, the
# commented-out alternative GANGLIA_LINK turned the rest of the line into a
# comment, so GANGLIA_LINK and DCACHE_URL were never set.

# File that WriteStatusPage copies the finished report to.
STATUS_PAGE="/var/www/html/spracemon.html"
# Hosts checked against SERVERS_LOAD_THRESHOLD.
MAIN_SERVERS="acs.grid osgce.grid osgse.grid storage01.grid storage02.grid"
# A node is reported when its load is greater than this value.
LOAD_THRESHOLD=9
# A main server is reported when its load is greater than this value.
SERVERS_LOAD_THRESHOLD=14
# A dCache pool is reported when its occupation (percent) exceeds this value.
POOL_THRESHOLD=90
# Idle jobs are reported when their count exceeds this value.
IDLE_THRESHOLD=999
#GANGLIA_LINK='http://prod-frontend.hepgrid.uerj.br/ganglia/'
GANGLIA_LINK='http://osg-ce.sprace.org.br/ganglia'
DCACHE_URL="http://osg-se.sprace.org.br:2288"
#DCACHE_URL="http://cdfdca.fnal.gov:2288/cellInfo"

# Capture a list with all the nodes of the cluster.
# The three assignments below derive the list from the Ganglia front page and
# strip the ".grid" suffix and the main servers from it.
# NOTE(review): in this copy they sit on a comment line, so they are inactive;
# confirm against the deployed script whether they were meant to run. Either
# way the value is replaced by the static file read below.
# NODE_LIST=`links -source $GANGLIA_LINK | grep 'OPTION.*\.grid' | sed 's/<[^>]*>/ /g'`
# NODE_LIST=`echo $NODE_LIST | sed 's/\.grid//g'`
# NODE_LIST=`echo $NODE_LIST | sed 's/osgce//g;s/storage01//g;s/storage02//g;s/osgse//g;s/acs//g'`

# The node list is read from a static file.
NODE_LIST=`cat /root/bin/cluster.list`

# PhEDEx component-status pages, one per instance (production and debug).
PHEDEX_PROD_URL='http://cmsweb.cern.ch/phedex/prod/Components::Status'
PHEDEX_DEBG_URL='http://cmsweb.cern.ch/phedex/debug/Components::Status'

# Footer text appended to alert messages (ReportToTeam expands $FOOTER).
# The opening of this assignment was missing in the copy under review, which
# left the text to be executed as commands and an unterminated quote.
# NOTE(review): any HTML markup the literal originally carried is not visible
# here and is not reconstructed; the wording is kept exactly as found.
FOOTER="This is an automatic email, please do not reply
Message send in $(date)"

#######################################
# Append a section heading to the report.
# Globals:   MAIL_BODY (read) - path of the report file being assembled
# Arguments: $1 - heading text
# NOTE(review): the copy under review showed only the residue '$1</h2'; the
# <h2>...</h2> pair is restored from that residue. The second echo is kept as
# found (a literal holding one newline); it presumably carried a separator
# tag that was stripped - confirm against the deployed script.
#######################################
function Header() {
  {
    echo "<h2>$1</h2>"
    echo "
"
  } >> "$MAIL_BODY"
}

#######################################
# Append the opening tag of an HTML table to the report.
# Globals: MAIL_BODY (read) - path of the report file being assembled
# NOTE(review): the markup was stripped from the copy under review (only
# blank lines remained); <table> is restored from the function name and its
# pairing with CloseTable. Any attributes it carried are unknown - confirm.
#######################################
function OpenTable() {
  echo "<table>" >> "$MAIL_BODY"
}

#######################################
# Append the closing tag of an HTML table to the report.
# Globals: MAIL_BODY (read) - path of the report file being assembled
# NOTE(review): the markup was stripped from the copy under review (only
# blank lines remained); </table> is restored from the function name - confirm.
#######################################
function CloseTable() {
  echo "</table>" >> "$MAIL_BODY"
}

#######################################
# Append the opening tag of a table row to the report.
# Globals: MAIL_BODY (read) - path of the report file being assembled
# NOTE(review): the literal was empty in the copy under review; <tr> is
# restored from the function name and its use between OpenTable and
# OpenCell - confirm.
#######################################
function OpenTbLine() {
  echo "<tr>" >> "$MAIL_BODY"
}

#######################################
# Append the closing tag of a table row to the report.
# Globals: MAIL_BODY (read) - path of the report file being assembled
# NOTE(review): the literal was empty in the copy under review; </tr> is
# restored from the function name - confirm.
#######################################
function CloseTbLine() {
  echo "</tr>" >> "$MAIL_BODY"
}

#######################################
# Append the opening tag of a table cell to the report.
# Globals: MAIL_BODY (read) - path of the report file being assembled
# NOTE(review): the literal was empty in the copy under review; <td> is
# restored from the function name. The original used single quotes, so it
# may have carried quoted attributes that are not recoverable here - confirm.
#######################################
function OpenCell() {
  echo '<td>' >> "$MAIL_BODY"
}

#######################################
# Append the closing tag of a table cell to the report.
# Globals: MAIL_BODY (read) - path of the report file being assembled
# NOTE(review): the literal was empty in the copy under review; </td> is
# restored from the function name - confirm.
#######################################
function CloseCell() {
  echo "</td>" >> "$MAIL_BODY"
}

#######################################
# Append a "For details click here" hyperlink to the report.
# Globals:   MAIL_BODY (read) - path of the report file being assembled
# Arguments: $1 - target URL
# NOTE(review): only the words "For details " and "click here" survived in
# the copy under review and the URL argument was unused; the anchor around
# "click here" is restored from the call site, which passes a URL - confirm.
#######################################
function Link() {
  printf 'For details <a href="%s">click here</a>\n' "$1" >> "$MAIL_BODY"
}

#######################################
# Append a piece of text followed by a line break to the report.
# Globals:   MAIL_BODY (read) - path of the report file being assembled
# Arguments: $1 - text (may itself contain HTML)
# NOTE(review): in the copy under review the text was followed by a bare
# newline inside the literal, which has no effect in an HTML report; <br> is
# restored as the line break the function name implies - confirm.
#######################################
function WriteLn() {
  printf '%s<br>\n' "$1" >> "$MAIL_BODY"
}

#######################################
# Append a piece of text, followed by one space, to the report.
# Globals:   MAIL_BODY (read) - path of the report file being assembled
# Arguments: $1 - text (may itself contain HTML)
# The body was collapsed onto the definition line, so the closing brace was
# passed to echo and the function never ended. printf is used so that text
# such as "-n" is written literally.
#######################################
function Write() {
  printf '%s \n' "$1" >> "$MAIL_BODY"
}

#######################################
# Publish the assembled report by copying it over the status page.
# Globals: MAIL_BODY (read)   - path of the report file
#          STATUS_PAGE (read) - destination file
# Returns: exit status of cp
# The body was collapsed onto the definition line, so the closing brace was
# passed to cp as an extra operand; both paths are now quoted as well.
#######################################
function WriteStatusPage() {
  cp -- "$MAIL_BODY" "$STATUS_PAGE"
}

#######################################
# Write the opening part of the report: title text, report heading and the
# start time.
# Globals: MAIL_BODY (read) - path of the report file being assembled
# The statements were collapsed onto a few lines without separators, so every
# echo after the first was passed as an argument and the function never
# closed. They are separated again here.
# NOTE(review): the HTML markup of these literals was stripped in the copy
# under review and cannot be recovered from it: the twelve leading lines and
# the two after the title are empty, and the last literal is only the tail of
# a styled tag. The visible text is kept exactly as found; restore the markup
# from the deployed script.
#######################################
function InitMail() {
  local i
  {
    for i in 1 2 3 4 5 6 7 8 9 10 11 12; do
      echo ""
    done
    echo "SPRACE Monitoring script"
    echo ""
    echo ""
    echo "

SPRACE Monitoring Report

"
    echo "

Test start at $(date)

"
    echo " http://osg-ce.sprace.org.br/banner-unico-aa.png);background-repeat: no-repeat\">"
  } >> "$MAIL_BODY"
}

#######################################
# Write the closing part of the report: the end time and the credits.
# Globals: MAIL_BODY (read) - path of the report file being assembled
# The statements were collapsed without separators, so every echo after the
# first was passed as an argument and the function never closed. They are
# separated again here.
# NOTE(review): the HTML markup of these literals (including whatever
# "[1]." linked to) was stripped in the copy under review and cannot be
# recovered from it. The visible text is kept exactly as found; restore the
# markup from the deployed script.
#######################################
function CloseMail() {
  {
    echo "

Test done at $(date)

"
    echo "
"
    echo "Report generated by monitor.sh script, developed by Jadir Silva with support of Allan Szu
"
    echo "and some suggestions from Sergio Lietti following steps defined by Marco Dias in "
    echo "[1]."
    echo "

Obs.: This script still under development, if you have any opinion,
"
    echo "contact me at jadir.silva13@gmail.com
"
    echo " "
    echo ""
  } >> "$MAIL_BODY"
}

#######################################
# Send a file as an HTML e-mail through sendmail.
# Globals:   SENDMAIL (read, optional) - mailer to run; defaults to
#            /usr/sbin/sendmail
# Arguments: $1 - file holding the message body
#            $2 - From address
#            $3 - To address
#            $4 - Cc address
#            $5 - subject
# Returns:   exit status of the mailer
# The headers come from the here-document (read by cat as "-") and are
# followed by the body file. In the copy under review the header lines had
# been collapsed onto the command line and the here-document terminator and
# the closing brace were missing. The empty line before the terminator
# separates the headers from the body.
#######################################
function SendMail() {
  cat - "$1" <<HERE | "${SENDMAIL:-/usr/sbin/sendmail}" -oi -t
From: $2
To: $3
Cc: $4
Subject: $5
Content-Type: text/html; charset=us-ascii
Content-Transfer-Encoding: 7bit
MIME-Version: 1.0

HERE
}

#######################################
# Mail the accumulated alert text to the operations team.
# Globals:   MSG (read) - alert text built by ReportToTeam
#            MAILFROM, MAILTO, CARBON_COPY, MAILSUBJECT (read; the last three
#            are overwritten when the script runs as monitor_debug.sh)
#            ReportMail (written) - path of the temporary message file
# Returns:   exit status of SendMail, or 1 if the temporary file cannot be
#            created
# The statements were collapsed onto one line and the commented-out rm had
# swallowed the closing brace. The temporary file is removed again after
# sending: SendMail reads it synchronously, and leaving it behind leaked one
# file in /tmp per run.
#######################################
SendMsgToTeam() {
  local rc

  # When running the debug copy, redirect the mail to the developer only.
  if [ "${0##*/}" == "monitor_debug.sh" ]; then
    MAILSUBJECT="SPRACE - Debug"
    CARBON_COPY="jadirmarra@yahoo.com.br"
    MAILTO="jadir.silva13@gmail.com"
  fi

  ReportMail=$(mktemp /tmp/ReportMail.XXXXXX) || return 1
  echo "$MSG" > "$ReportMail"
  SendMail "$ReportMail" "$MAILFROM" "$MAILTO" "$CARBON_COPY" "$MAILSUBJECT"
  rc=$?
  rm -f -- "$ReportMail"
  return "$rc"
}

#######################################
# Append one alert section to the message that will be mailed to the team.
# Every section is: two newlines, a title, two newlines, then an optional
# detail sentence.
# Globals:   MSG (read/written)    - accumulated alert text
#            LOAD_THRESHOLD (read) - used by the node-load section
#            FOOTER (read)         - appended by the node_down section
# Arguments: $1 - alert type (idle, sam-error, dcache, dpool, server-load,
#                 node-load, node_down, condor_down, low_disk, job_robot,
#                 old_jobs, site_verify, phedex_down)
#            $2 - value interpolated into the detail sentence; the
#                 server-load, low_disk, job_robot, old_jobs, site_verify and
#                 phedex_down sections do not use it
# An unknown alert type leaves MSG untouched.
#######################################
function ReportToTeam() {
  local nl=$'\n'
  local title
  local detail=""

  case "$1" in
    idle)
      title="Lot of Jobs in idle state"
      detail="There are $2 in idle state on the farm.${nl}"
      ;;
    sam-error)
      title="Error on SAM test"
      detail="The SAM test presents some errors $2.${nl}"
      ;;
    dcache)
      title="Error in some services of dcache"
      detail="DCache have $2 stopped services.${nl}"
      ;;
    dpool)
      title="Low space on some pools in dcache"
      detail="DCache has some pools with $2 of space used.${nl}"
      ;;
    server-load)
      title="Load of main servers"
      ;;
    node-load)
      title="Load dos nodes"
      detail="$2 beyond $LOAD_THRESHOLD.${nl}"
      ;;
    node_down)
      title="Node Down"
      detail="$2 down.${nl}$FOOTER"
      ;;
    condor_down)
      title="Condor Down"
      detail="$2 with condor stopped.${nl}"
      ;;
    low_disk)
      title="Low space on scratch in following nodes"
      ;;
    job_robot)
      title="JobRobot with low efficiency"
      ;;
    old_jobs)
      title="Jobs more than 2 days on the farm"
      ;;
    site_verify)
      title="Site verify failed."
      ;;
    phedex_down)
      title="Phedex Agents status."
      ;;
    *)
      return 0
      ;;
  esac

  MSG="${MSG}${nl}${nl}${title}${nl}${nl}${detail}"
}

#######################################
# Report the JobRobot efficiency of the site for a given day.
# Fetches the JobRobot summary page of "$1 day(s) ago", takes the six lines
# starting at the first line that mentions the site, strips the HTML tags and
# reads the sixth word as the efficiency percentage.
# Globals:   JOB_ROBOT_URL1, JOBROBOT1, POSITION, SED_DATA, EFICIENCIA, EFI
#            (all written)
# Arguments: $1 - how many days back to look
# Outputs:   one WriteLn line; calls ReportToTeam "job_robot" when the
#            efficiency is below 60 and $1 is 1
# Fixes relative to the copy under review:
#  - the site was processed only when it was NOT found (the test on POSITION
#    was "= \"\"", which also produced an invalid sed range); it is now "!=";
#  - only the first matching line is used, so several matches no longer break
#    the arithmetic;
#  - an efficiency that is not a plain integer is shown as unknown instead of
#    breaking the numeric comparison;
#  - statements collapsed onto one line (else/fi passed as arguments) are
#    separated again.
#######################################
function JobRobotTest() {
  JOB_ROBOT_URL1="http://belforte.home.cern.ch/belforte/JobRobot/summary_$(date '+%y%m%d' -d "$1 day ago").html"
  JOBROBOT1=$(links -source "$JOB_ROBOT_URL1")
  POSITION=$(echo "$JOBROBOT1" | grep -n ' T2_BR_SPRACE' | head -n 1 | awk -F: '{print $1}')

  if [ "$POSITION" != "" ]; then
    SED_DATA="$POSITION,$((POSITION+5))p"
    JOBROBOT1=$(echo "$JOBROBOT1" | sed -n "$SED_DATA" | sed 's/<[^>]*>/ /g')
    # Unquoted on purpose: joins the six lines into one so awk can count words.
    # shellcheck disable=SC2086
    EFICIENCIA=$(echo $JOBROBOT1 | awk '{print $6}')

    case "$EFICIENCIA" in
      '' | *[!0-9]*)
        WriteLn "Efficiency : -- -- --"
        ;;
      *)
        if [ "$EFICIENCIA" -lt 60 ]; then
          # Only yesterday's result raises an alert.
          if [ "$1" == "1" ]; then
            ReportToTeam "job_robot" "$EFICIENCIA"
          fi
          EFI="Efficiency : $EFICIENCIA% (below expected) (Test done at $(date '+%d/%m/%Y' -d "$1 day ago"))."
        else
          EFI="Efficiency : $EFICIENCIA% Ok (Test done at $(date '+%d/%m/%Y' -d "$1 day ago"))."
        fi
        WriteLn "$EFI"
        ;;
    esac
  else
    WriteLn "Efficiency : -- -- --"
  fi
}

# Initialise the OSG environment.
# The command had been collapsed onto the comment line above it, which left
# it commented out.
source /OSG/setup.sh

InitMail
#####################################################
# Space reserved for tests; do not type anything here.


# STEP 1. Check how many nodes are down.
# Reads the Ganglia front page, lists every host marked "class=down" in the
# report, removes those hosts from UP_LIST and raises a "node_down" alert.
# Fixes relative to the copy under review:
#  - Header had been collapsed onto the comment line and was never called;
#  - the if/else/for statements were collapsed and did not parse;
#  - IFS was left set to '|' after the loop (and reset from OLD_IFS, which is
#    not assigned anywhere in the code visible here); the list is now split
#    with a scoped "IFS='|' read", so the global IFS is never touched;
#  - a down host was removed from UP_LIST with a substring sed, which also
#    mangled names that merely contain it (node1 inside node10); the list is
#    now filtered by exact name.
# NOTE(review): UP_LIST is not initialised in the code visible here - confirm
# it is set (presumably from NODE_LIST) before this point.
Header "Hosts down"

HDOWN=$(links -source "$GANGLIA_LINK" | grep 'class=down' | sed 's/<[^>]*>/ /g' | awk '{ print $1,"|" }')
# Unquoted on purpose: joins the per-host lines into a single line.
# shellcheck disable=SC2086
HDOWN=$(echo $HDOWN | sed 's/\.grid//g')

if [ "$HDOWN" == "" ]; then
  WriteLn "No hosts down."
else
  IFS='|' read -r -a down_hosts <<< "$HDOWN"
  for host in "${down_hosts[@]}"; do
    # Strip the padding left around each name by the '|' separators.
    NODE_NAME=${host//[[:space:]]/}
    [ -n "$NODE_NAME" ] || continue

    Write "$NODE_NAME  "

    still_up=""
    for up_node in $UP_LIST; do
      if [ "$up_node" != "$NODE_NAME" ]; then
        still_up="$still_up $up_node"
      fi
    done
    UP_LIST=$still_up

    Link "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$NODE_NAME.grid"
    # NOTE(review): kept as found (a bare newline); this presumably was a
    # line-break tag that got stripped - confirm.
    Write "
"
  done
  ReportToTeam "node_down" "$HDOWN"
fi


# STEP 2. Check the hosts whose load is above LOAD_THRESHOLD.
# For every node in UP_LIST the load is scraped from lines 82-84 of its
# Ganglia host page; the integer part is compared with LOAD_THRESHOLD.
# Fixes relative to the copy under review:
#  - the whole step had been collapsed onto the comment line and was inactive;
#  - the alert passed "$MSG" (the message being built) instead of the list of
#    overloaded nodes collected in LOADS, which was otherwise unused;
#  - a node whose load could not be read made the numeric test fail; it is
#    now skipped with a warning on stderr.
Header "Hosts with load equal/above $((LOAD_THRESHOLD+1))"
NADA=0
LOADS=""
for a in $UP_LIST; do
  node_load=$(links -source "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$a" | sed -n '85q;82,84p' | sed 's/<[^>]*>/ /g')
  # Unquoted on purpose: collapses whitespace so the first word can be taken.
  # shellcheck disable=SC2086
  LOAD=$(echo $node_load | awk '{print $1}')
  # Drop the two decimal digits, keeping the integer part.
  LOAD=$(echo "$LOAD" | sed 's/\.[0-9][0-9]//g')

  case "$LOAD" in
    '' | *[!0-9]*)
      echo "monitor: could not read the load of $a" >&2
      continue
      ;;
  esac

  if [ "$LOAD" -gt "$LOAD_THRESHOLD" ]; then
    WriteLn "$a load : $LOAD"
    LOADS="$LOADS
$a(load=$LOAD)"
    NADA=1
  fi
done

if [ "$NADA" == 0 ]; then
  WriteLn "No host with load equal/above $((LOAD_THRESHOLD+1))."
  NADA=0
else
  ReportToTeam "node-load" "$LOADS"
fi

NADA=0
Header "Load of main servers"

# STEP 3. Load above SERVERS_LOAD_THRESHOLD on the main servers.
# Same scraping as for the nodes, applied to every host in MAIN_SERVERS.
# Fixes relative to the copy under review:
#  - the Header string had lost its closing quote;
#  - the loop had been collapsed onto the comment line and was inactive;
#  - LOAD_NODE was overwritten on every hit, so only the last overloaded
#    server was kept; hits are now accumulated;
#  - a server whose load could not be read made the numeric test fail; it is
#    now skipped with a warning on stderr.
LOAD_NODE=""
for a in $MAIN_SERVERS; do
  # node_load=`links -source "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$a" | sed -n '85q;82,84p' | sed 's/<[^>]*>/ /g' | grep '[1-9][5-9]\.'`
  node_load=$(links -source "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$a" | sed -n '85q;82,84p' | sed 's/<[^>]*>/ /g')
  # Unquoted on purpose: collapses whitespace so the first word can be taken.
  # shellcheck disable=SC2086
  LOAD=$(echo $node_load | awk '{print $1}')
  # Drop the two decimal digits, keeping the integer part.
  LOAD=$(echo "$LOAD" | sed 's/\.[0-9][0-9]//g')

  case "$LOAD" in
    '' | *[!0-9]*)
      echo "monitor: could not read the load of $a" >&2
      continue
      ;;
  esac

  if [ "$LOAD" -gt "$SERVERS_LOAD_THRESHOLD" ]; then
    LOAD_NODE="${LOAD_NODE:+$LOAD_NODE }$a(load=$LOAD)"
    WriteLn "$a load : $LOAD"
    NADA=1
  fi
done

if [ "$NADA" == 0 ]; then
  WriteLn "No host with load equal/above $((SERVERS_LOAD_THRESHOLD+1))."
  NADA=0
else
  ReportToTeam "server-load" "$LOAD_NODE"
fi

# STEP 4. Site verify.
# Runs the OSG site_verify.pl script as GRID_USER and keeps only the output
# lines containing " FAIL" (case-insensitive). No such line means success.
# The command had been collapsed onto the comment line (leaving it inactive)
# and the if/else statements were collapsed and did not parse.
SITE_VERIFY=$(su - "$GRID_USER" -c "source /opt/osg-1.0.0/setup.sh;/opt/osg-1.0.0/verify/site_verify.pl" | grep -i " FAIL")

Header "Siteverify.pl status"

if [ "$SITE_VERIFY" == "" ]; then
  WriteLn "Site verify test: SUCCESS"
else
  WriteLn "Errors founded:"
  WriteLn "$SITE_VERIFY"
  ReportToTeam "site_verify" "$SITE_VERIFY"
fi

# Check whether condor is running on every node.
# A node counts as stopped when "ps -fu condor" over ssh prints nothing
# besides its header line (this also covers a node that cannot be reached).
# Fixes relative to the copy under review: the step had been collapsed onto
# the comment line, the for loop had lost its "done", and the if/else
# statements did not parse.
Header "Condor status"
CRUNNING=0
CSTOPPED=0
CONDOR_MSG=""
for node in $UP_LIST; do
  IS_RUNNING=$(ssh "$node" ps -fu condor | grep -v UID)
  if [ "$IS_RUNNING" == "" ]; then
    WriteLn "$node with condor stopped."
    CSTOPPED=$((CSTOPPED+1))
    CONDOR_MSG="$CONDOR_MSG
$node"
  else
    CRUNNING=$((CRUNNING+1))
  fi
done

if [ "$CSTOPPED" == "0" ]; then
  WriteLn "Condor running on all active nodes"
else
  ReportToTeam "condor_down" "$CONDOR_MSG"
  WriteLn "Condor running on $CRUNNING nodes and stopped on $CSTOPPED nodes."
fi

# Job status.
# Takes the condor_q line that contains "running" and reads the total, idle,
# running and held counts from its 1st, 3rd, 5th and 7th words.
# Fixes relative to the copy under review: the if/else statements were
# collapsed and did not parse, and an empty or non-numeric idle count (for
# example when condor_q fails) broke the numeric comparison; the comparison is
# now made only when the count is a plain integer.
Header "Job status"
JOBS_STATS=$(condor_q | grep running)
# Unquoted on purpose: collapses whitespace so awk can count words.
# shellcheck disable=SC2086
IDLE_JOBS=$(echo $JOBS_STATS | awk '{print $3}')
# shellcheck disable=SC2086
TOTAL_JOBS=$(echo $JOBS_STATS | awk '{print $1}')
# shellcheck disable=SC2086
RUN_JOBS=$(echo $JOBS_STATS | awk '{print $5}')
# shellcheck disable=SC2086
HELD_JOBS=$(echo $JOBS_STATS | awk '{print $7}')

OpenTable
OpenTbLine
OpenCell
WriteLn "Running: $RUN_JOBS"
if [[ "$IDLE_JOBS" =~ ^[0-9]+$ ]] && [ "$IDLE_JOBS" -gt "$IDLE_THRESHOLD" ]; then
  WriteLn "Idle.......: $IDLE_JOBS Warning!!!"
  ReportToTeam "idle" "$IDLE_JOBS"
else
  WriteLn "Idle.......: $IDLE_JOBS"
fi
WriteLn "Held.......: $HELD_JOBS"
WriteLn "Total......: $TOTAL_JOBS"
CloseCell
OpenCell
WriteLn "     &nbsp"
CloseCell
OpenCell
WriteLn "If has any job held or more than $((IDLE_THRESHOLD+1)) jobs in idle
Please report to sprace_ops@yahoo.com.br"
CloseCell
CloseTbLine
CloseTable

# Jobs that have been running for two days or more.
# Selects the "condor_q -run" lines whose day count (the number in front of
# '+') is 2 or higher.
# Fixes relative to the copy under review:
#  - the grep pattern was unquoted (subject to filename expansion) and only
#    matched a single digit 2-9 in front of '+', so 10, 11, 20, 21 ... days
#    were missed; the pattern now accepts 2-9 or any multi-digit count;
#  - the sed replacement contained a raw newline, which sed rejects.
# NOTE(review): <br> is restored in the sed replacement as the line break
# that presumably was stripped there - confirm. The pattern assumes the run
# time is printed as DAYS+HH:MM:SS - confirm against the local condor_q.
Header "Jobs with more than 2 days on the farm"
JOBS_RUNNING=$(condor_q -run | grep -E '(^|[^0-9])([2-9]|[1-9][0-9]+)\+')
if [ "$JOBS_RUNNING" == "" ]; then
  WriteLn "No jobs more than 2 days on the farm"
else
  # Unquoted on purpose: joins the lines before a break is put after each host.
  # shellcheck disable=SC2086
  MORE2DAYS=$(echo $JOBS_RUNNING | sed 's/\.grid/\.grid<br>/g')
  WriteLn "$MORE2DAYS"
  ReportToTeam "old_jobs" "$MORE2DAYS"
fi

# Farm occupation.
# Counts the "condor_q -run" lines per value of their second column (lines
# containing OWNER or Submitter are dropped, as is the first line of the
# count).
# In the copy under review the WriteLn argument was an unterminated quote
# with its content missing, so the computed value was never written.
# NOTE(review): the count is written as-is; any markup that originally
# wrapped it is not recoverable from this copy - confirm.
Header "Farm occupation"
FarmOcupation=$(condor_q -run | grep -v "OWNER" | grep -v "Submitter" | awk '{print $2}' | sort | uniq -c | sed 1d)
WriteLn "$FarmOcupation"

# SAM test: fetch the dashboard summary page for the site, keep only the
# lines that mention "latestresultssmrytable", cut each one so that it starts
# at that word, and drop the target="_blank"> fragment.
Header "SAM test"

SAM=$(
  links -source "http://dashb-cms-sam.cern.ch/dashboard/request.py/latestresultssmry?siteSelect3=T2T1T0&serviceTypeSelect3=vo&sites=T2_BR_SPRACE&services=CE&services=SRMv2&tests=1301&tests=133&tests=111&tests=6&tests=1261&tests=76&tests=64&tests=20&tests=281&tests=882&exitStatus=all" \
    | sed -e '/latestresultssmrytable/!d' \
    | awk '{ print substr($0,index($0,"latestresultssmrytable")) }' \
    | sed 's/target=\"\_blank\">//g'
)

SAM_LINK='"http://dashb-cms-sam.cern.ch/dashboard/request.py/'$SAM SAM_LINK=`echo $SAM_LINK | sed 's/\"//g'` SAM_TABLE=`links -source $SAM_LINK | sed 's/\/dashboard/http\:\/\/dashb-cms-sam\.cern\.ch\/dashboard/g'` SAM_TABLE=`echo $SAM_TABLE | sed 's/SAM-Latest Results<\/title><\/head><body>//g'` SAM_TABLE=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"<link rel")) }'` SAM_TABLE=`echo $SAM_TABLE | sed 's/<\/body><\/html>'//g` SAM_TABLE=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"<div"))}'` #SAM_TABLE=`echo $SAM_TABLE | sed 's/Service Type/Tipo de Servico/g;s/Service Name/Nome do Servico/g;s/Sitename/Sitio/g'` <p /> SAM_RED1_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF0000"),24) }' ` SAM_RED2_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF6666"),24) }' ` SAM_RED3_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF9999"),24) }' ` <p /> SAM_CRIT1_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#CC00CC"),24) }' ` SAM_CRIT2_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF33FF"),24) }' ` SAM_CRIT3_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF99FF"),24) }' ` <p /> if [ "$SAM_RED1_ERROR" == "background-color:#FF0000" ];then <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/ReportToTeam?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="ReportToTeam (this topic does not yet exist; you can create it)">ReportToTeam</a></span> "sam-error" "Dark red code" fi <p /> if [ "$SAM_RED2_ERROR" == "background-color:#FF6666" ];then <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/ReportToTeam?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="ReportToTeam (this topic does not yet exist; you can create it)">ReportToTeam</a></span> "sam-error" "Light red code" fi <p /> if [ "$SAM_RED3_ERROR" == "background-color:#FF9999" 
];then <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/ReportToTeam?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="ReportToTeam (this topic does not yet exist; you can create it)">ReportToTeam</a></span> "sam-error" "Lightest red code" fi <p /> if [ "$SAM_CRIT1_ERROR" == "background-color:#CC00CC" ];then <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/ReportToTeam?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="ReportToTeam (this topic does not yet exist; you can create it)">ReportToTeam</a></span> "sam-error" "Dark critical code" fi <p /> if [ "$SAM_CRIT2_ERROR" == "background-color:#FF33FF" ];then <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/ReportToTeam?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="ReportToTeam (this topic does not yet exist; you can create it)">ReportToTeam</a></span> "sam-error" "Light critical code" fi <p /> if [ "$SAM_CRIT3_ERROR" == "background-color:#FF99FF" ];then <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/ReportToTeam?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="ReportToTeam (this topic does not yet exist; you can create it)">ReportToTeam</a></span> "sam-error" "Lightest critical code" fi <p /> echo "$SAM_TABLE" >> $MAIL_BODY <p /> # Verifica os servicos do d-cache Header "DCache status" <p /> CELL_INFO=`links -source $DCACHE_URL/cellinfo` USAG_INFO=`links -source $DCACHE_URL/usageInfo` CELL_STAT=`echo "$CELL_INFO" | sed 's/<[^>]*>/ /g' | grep -i offline | wc -l` USAG_STAT=`echo "$USAG_INFO" | sed 's/<[^>]*>/ /g'` <p /> if [ $CELL_STAT -gt 0 ];then <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteLn?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteLn (this topic does not yet exist; you can create it)">WriteLn</a></span> "$CELL_STAT dcache services are stopped" <span class="twikiNewLink"><a 
href="https://sprace.org.br/twiki/bin/edit/Main/ReportToTeam?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="ReportToTeam (this topic does not yet exist; you can create it)">ReportToTeam</a></span> "dcache" "$CELL_STAT" else <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteLn?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteLn (this topic does not yet exist; you can create it)">WriteLn</a></span> "All dcache services(<i>daemons</i>) ok.<br />" fi <p /> LINE="" TOTAL=0 FREE=0 REPORTAR="" echo "$USAG_INFO" | egrep '(cell|total|free|precious)' | grep '<td' | grep -v 'layout' | sed 's/<td class=\"//g;s/\">/ /g;s/<\/td>//g' | while read celula valor ;do <p /> if [ "$celula" == "cell" ];then LINE="$LINE $valor" fi <p /> if [ "$celula" == "total" ];then TOTAL="$valor" fi <p /> if [ "$celula" == "free" ];then FREE="$valor" fi <p /> if [ "$celula" == "precious" ];then PERCENT=`cat - << HERE | bc scale=0 100-((100*$FREE)/$TOTAL) HERE` <p /> if [ $PERCENT -gt $POOL_THRESHOLD ];then REPORTAR="$REPORTAR <br />$LINE with $PERCENT occupation" PERCENT="<font color=#ff0000>$PERCENT%</font>" else PERCENT="$PERCENT%" fi <p /> <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteLn?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteLn (this topic does not yet exist; you can create it)">WriteLn</a></span> "$LINE with $PERCENT ocuppation" LINE="" fi done <p /> if [ "$REPORTAR" = "" ];then <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/ReportToTeam?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="ReportToTeam (this topic does not yet exist; you can create it)">ReportToTeam</a></span> "dpool" "$REPORTAR" fi <p /> ####################### # Lietti sugeriu acrescentar no relatorio o espaco ocupado pelo/scratch dos nodes # entao desenvolvi o codigo abaixo para fazer isto. 
x=0 Header "Ocuppation of /scratch on nodes" <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteLn?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteLn (this topic does not yet exist; you can create it)">WriteLn</a></span> "<h3>Only nodes less than 8Gb.</h3>" IFS=$OLD_IFS LOW_DISK_NODES="" nodes_to_save="" for node in $UP_LIST;do if [ "$node" = "`/bin/hostname -s`" ]; then eval "df -h /scratch" else saida=`ssh $node "df /scratch"` size=`echo $saida | awk '{print $11}'` perc=`echo $saida | awk '{print $12}'` if [ $size -lt 8388608 ];then x=$(($x+1)) if [ $size -lt 1048576 ];then node_disk_space="$node"'('"$(($size/1024))Mb"') ' <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteLn?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteLn (this topic does not yet exist; you can create it)">WriteLn</a></span> "$node_disk_space" LOW_DISK_NODES="$LOW_DISK_NODES $node_disk_space<br />" nodes_to_save="$nodes_to_save $node" else node_disk_space="$node"'('"$(($size/1048576))Gb"') ' <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteLn?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteLn (this topic does not yet exist; you can create it)">WriteLn</a></span> "$node_disk_space" LOW_DISK_NODES="$LOW_DISK_NODES $node_disk_space<br />" nodes_to_save="$nodes_to_save $node" fi fi fi done <p /> if [ "$LOW_DISK_NODES" = "" ];then <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/ReportToTeam?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="ReportToTeam (this topic does not yet exist; you can create it)">ReportToTeam</a></span> "low_disk" "$LOW_DISK_NODES" echo "$nodes_to_save" > /tmp/nodes_full.txt else <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteLn?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteLn (this topic does not yet exist; you can create 
it)">WriteLn</a></span> "No node with low space on /scratch" fi <p /> <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteLn?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteLn (this topic does not yet exist; you can create it)">WriteLn</a></span> "" <p /> Header "JobRobot Status" <p /> <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/JobRobotTest?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="JobRobotTest (this topic does not yet exist; you can create it)">JobRobotTest</a></span> 1 <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/JobRobotTest?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="JobRobotTest (this topic does not yet exist; you can create it)">JobRobotTest</a></span> 2 <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/JobRobotTest?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="JobRobotTest (this topic does not yet exist; you can create it)">JobRobotTest</a></span> 3 <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/JobRobotTest?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="JobRobotTest (this topic does not yet exist; you can create it)">JobRobotTest</a></span> 4 <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/JobRobotTest?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="JobRobotTest (this topic does not yet exist; you can create it)">JobRobotTest</a></span> 5 <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/JobRobotTest?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="JobRobotTest (this topic does not yet exist; you can create it)">JobRobotTest</a></span> 6 <p /> if [ "$DEBUG" == "monitor_debug.sh" ];then # codigo de teste ou instavel deve ficar aqui #################### Header "CEMon Status" ldap=`ldapsearch -x -LLL -p 2170 -h is.grid.iu.edu -b 
mds-vo-name=SPRACE,mds-vo-name=local,o=grid` cods=`ssh node34 "source /OSG/setup.sh;condor_status -pool osg-ress-1.fnal.gov -l -constraint 'GlueCEInfoHostName == \"osg-ce.sprace.org.br\"'"` <p /> LDAP_STATUS=`echo $ldap | grep ` <p /> # fim da area de teste ########################################### fi <p /> # status dos agentes do phedex Header "Phedex Agents Status" <p /> PRODUCTION_STATUS=`links -source $PHEDEX_PROD_URL` DEBUG_STATUS=`links -source $PHEDEX_DEBG_URL` <p /> PROD_POSITION=`echo "$PRODUCTION_STATUS" | grep -n "$TIER" | awk -F: '{print $1}'` DEBG_POSITION=`echo "$DEBUG_STATUS" | grep -n "$TIER" | awk -F: '{print $1}'` <p /> TBL_CODE="" TBL_CODE2="" <p /> if [ "$PROD_POSITION" = "" ];then SED_DATA="$((PROD_POSITION+1)),$((PROD_POSITION+2))p" PROD_CODE=`echo "$PRODUCTION_STATUS" | sed -n $SED_DATA | sed 's/<[^>]*>/ /g'` TBL_CODE=`echo "$PRODUCTION_STATUS" | sed -n $SED_DATA` <p /> IS_AGENT_DOWN=`echo "$PROD_CODE" | grep DOWN` <p /> if [ "$IS_AGENT_DOWN" = "" ];then <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/ReportToTeam?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="ReportToTeam (this topic does not yet exist; you can create it)">ReportToTeam</a></span> "phedex_down" "Phedex(production) agent down" <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteLn?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteLn (this topic does not yet exist; you can create it)">WriteLn</a></span> "Phedex(production) agents down." else <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteLn?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteLn (this topic does not yet exist; you can create it)">WriteLn</a></span> "Phedex(production) agents are OK." 
fi fi <p /> <p /> if [ "$DEBG_POSITION" = "" ];then SED_DATA="$((DEBG_POSITION+1)),$((DEBG_POSITION+2))p" DEBG_CODE=`echo "$DEBUG_STATUS" | sed -n $SED_DATA | sed 's/<[^>]*>/ /g'` TBL_CODE2=`echo "$DEBUG_STATUS" | sed -n $SED_DATA` <p /> IS_AGENT_DOWN=`echo "$DEBG_CODE" | grep DOWN` <p /> if [ "$IS_AGENT_DOWN" = "" ];then <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/ReportToTeam?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="ReportToTeam (this topic does not yet exist; you can create it)">ReportToTeam</a></span> "phedex_down" "Phedex(debug) agent down" <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteLn?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteLn (this topic does not yet exist; you can create it)">WriteLn</a></span> "Phedex(debug) agents down." else <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteLn?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteLn (this topic does not yet exist; you can create it)">WriteLn</a></span> "Phedex(debug) agents are OK." fi fi <p /> Write "<table><tr><td>Production Agents</td>$TBL_CODE</tr><tr><td>Debug Agents</td>$TBL_CODE2</tr></table>" <p /> <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/CloseMail?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="CloseMail (this topic does not yet exist; you can create it)">CloseMail</a></span> <span class="twikiNewLink"><a href="https://sprace.org.br/twiki/bin/edit/Main/WriteStatusPage?topicparent=Main.TestLink;nowysiwyg=0" rel="nofollow" title="WriteStatusPage (this topic does not yet exist; 