os: ubuntu 16.04
db: postgresql 9.6
使用 patroni 搭建的 pgsql 高可用集群,发现 pgsql 无法启动,查看 syslog 发现如下报错:
# tail -n 200 /var/log/syslog
Jun 19 15:43:54 PGNODE1 patroni[66763]: 2019-06-19 15:43:54.374 CST [73241] FATAL: could not create semaphores: No space left on device
Jun 19 15:43:54 PGNODE1 patroni[66763]: 2019-06-19 15:43:54.374 CST [73241] DETAIL: Failed system call was semget(5432001, 17, 03600).
Jun 19 15:43:54 PGNODE1 patroni[66763]: 2019-06-19 15:43:54.374 CST [73241] HINT: This error does *not* mean that you have run out of disk space. It occurs when either the system limit for the maximum number of semaphore sets (SEMMNI), or the system wide maximum number of semaphores (SEMMNS), would be exceeded. You need to raise the respective kernel parameter. Alternatively, reduce PostgreSQL's consumption of semaphores by reducing its max_connections parameter.
Jun 19 15:43:54 PGNODE1 patroni[66763]: #011The PostgreSQL documentation contains more information about configuring your system for PostgreSQL.
Jun 19 15:43:54 PGNODE1 patroni[66763]: 2019-06-19 15:43:54.387 CST [73241] LOG: database system is shut down
Jun 19 15:43:54 PGNODE1 patroni[66763]: localhost:5432 - no response
Jun 19 15:43:55 PGNODE1 patroni[66763]: 2019-06-19 15:43:55.955 CST [73248] FATAL: could not create semaphores: No space left on device
Jun 19 15:43:55 PGNODE1 patroni[66763]: 2019-06-19 15:43:55.955 CST [73248] DETAIL: Failed system call was semget(5432001, 17, 03600).
Jun 19 15:43:55 PGNODE1 patroni[66763]: 2019-06-19 15:43:55.955 CST [73248] HINT: This error does *not* mean that you have run out of disk space. It occurs when either the system limit for the maximum number of semaphore sets (SEMMNI), or the system wide maximum number of semaphores (SEMMNS), would be exceeded. You need to raise the respective kernel parameter. Alternatively, reduce PostgreSQL's consumption of semaphores by reducing its max_connections parameter.
Jun 19 15:43:55 PGNODE1 patroni[66763]: #011The PostgreSQL documentation contains more information about configuring your system for PostgreSQL.
Jun 19 15:43:55 PGNODE1 patroni[66763]: 2019-06-19 15:43:55.968 CST [73248] LOG: database system is shut down
Jun 19 15:43:55 PGNODE1 patroni[66763]: localhost:5432 - no response
提示 sem 不够用,查看系统设置
# sysctl -a |grep -i kernel.sem
kernel.sem = 32000 1024000000 500 32000
# ipcs -l
------ Messages Limits --------
max queues system wide = 32000
max size of message (bytes) = 8192
default max size of queue (bytes) = 16384
------ Shared Memory Limits --------
max number of segments = 4096
max seg size (kbytes) = 18014398509465599
max total shared memory (kbytes) = 18014398442373116
min seg size (bytes) = 1
------ Semaphore Limits --------
max number of arrays = 32000
max semaphores per array = 32000
max semaphores system wide = 1024000000
max ops per semop call = 500
semaphore max value = 32767
# ipcs | wc -l
16305
# ipcs |more
# ipcs -u
------ Messages Status --------
allocated queues = 0
used headers = 0
used space = 0 bytes
------ Shared Memory Status --------
segments allocated 4
pages allocated 7
pages resident 7
pages swapped 0
Swap performance: 0 attempts 0 successes
------ Semaphore Status --------
used arrays = 24642
allocated semaphores = 345186
发现属于 root 用户的 sem 数量异常多:
# ipcs -t
10748228 root Mon Jun 17 18:49:54 2019 Mon Jun 17 18:49:54 2019
10780997 root Mon Jun 17 18:50:04 2019 Mon Jun 17 18:50:04 2019
10813766 root Mon Jun 17 18:50:14 2019 Mon Jun 17 18:50:14 2019
10846535 root Mon Jun 17 18:50:25 2019 Mon Jun 17 18:50:25 2019
10879304 root Mon Jun 17 18:50:35 2019 Mon Jun 17 18:50:35 2019
10912073 root Mon Jun 17 18:50:45 2019 Mon Jun 17 18:50:45 2019
10944842 root Mon Jun 17 18:50:55 2019 Mon Jun 17 18:50:55 2019
以上只截取了部分结果。观察时间戳可以发现,基本上每 10s 就新增一组信号量;如果一直不释放,信号量组数和信号量总数迟早会达到 kernel.sem 设定的上限。
结合 syslog 分析后发现,是 zabbix-agent 一直启动失败、被 systemd 反复重启,每次启动都残留一组未释放的信号量,最终导致 sem 耗尽。
Jun 20 15:34:03 PGNODE1 systemd[1]: zabbix-agent.service: Service hold-off time over, scheduling restart.
Jun 20 15:34:03 PGNODE1 systemd[1]: Stopped Zabbix Agent.
Jun 20 15:34:03 PGNODE1 systemd[1]: Starting Zabbix Agent...
Jun 20 15:34:03 PGNODE1 systemd[1]: zabbix-agent.service: PID file /run/zabbix/zabbix_agentd.pid not readable (yet?) after start: No such file or directory
Jun 20 15:34:03 PGNODE1 systemd[1]: zabbix-agent.service: Supervising process 87249 which is not our child. We'll most likely not notice when it exits.
Jun 20 15:34:03 PGNODE1 systemd[1]: Started Zabbix Agent.
Jun 20 15:34:03 PGNODE1 systemd[1]: zabbix-agent.service: Main process exited, code=exited, status=1/FAILURE
Jun 20 15:34:03 PGNODE1 systemd[1]: zabbix-agent.service: Unit entered failed state.
Jun 20 15:34:03 PGNODE1 systemd[1]: zabbix-agent.service: Failed with result 'exit-code'.
查看 zabbix-agent.service,看到 RestartSec=10s,与每 10s 新增一组信号量的现象吻合。
# cat /lib/systemd/system/zabbix-agent.service
[Unit]
Description=Zabbix Agent
After=syslog.target
After=network.target
[Service]
Environment="CONFFILE=/etc/zabbix/zabbix_agentd.conf"
EnvironmentFile=-/etc/default/zabbix-agent
Type=forking
Restart=on-failure
PIDFile=/run/zabbix/zabbix_agentd.pid
KillMode=control-group
ExecStart=/usr/sbin/zabbix_agentd -c $CONFFILE
ExecStop=/bin/kill -SIGTERM $MAINPID
RestartSec=10s
[Install]
WantedBy=multi-user.target
哈哈,看这里 RestartSec=10s
记录下释放所有已分配的共享内存及信号量的命令(注意:下面的命令不区分属主,会清理系统中全部的共享内存段和信号量,执行前请先确认没有正在运行的服务依赖它们):
# ipcs -m | awk '$2 ~ /[0-9]+/ {print $2}' | while read s; do sudo ipcrm -m $s; done
# ipcs -s | awk '$2 ~ /[0-9]+/ {print $2}' | while read s; do sudo ipcrm -s $s; done
记录下设置 kernel.sem 时,其顺序是:
kernel.sem=SEMMSL SEMMNS SEMOPM SEMMNI
kernel.sem = 250 32000 100 128
kernel.sem = 50100 64128000 50100 1280
kernel.sem = 32000 1024000000 500 32000
SEMMSL: 每组需要多少信号量, 代表每个信号集中的最大信号量数目 。
SEMMNS: 需要多少信号量, 代表系统范围内的最大信号量总数目(通常按 第二列 = 第一列 * 第四列 设置,即 SEMMNS = SEMMSL * SEMMNI,这是理论上限)。
SEMOPM: 代表每次 semop 系统调用中允许执行的最大信号量操作数目。
SEMMNI: 需要多少组, 代表系统范围内的最大信号集总数目。
limits.conf
顺带查看下 limits.conf
# vi /etc/security/limits.conf
* soft nofile 655360
* hard nofile 655360
* soft nproc 655360
* hard nproc 655360
* soft memlock unlimited
* hard memlock unlimited
* soft core unlimited
* hard core unlimited
* soft stack unlimited
* hard stack unlimited
参考:
https://www.cnblogs.com/MartinChentf/p/6057100.html
https://blog.csdn.net/huangyimo/article/details/80236181
https://yq.aliyun.com/articles/510665?spm=5176.10695662.1996646101.searchclickresult.e5126661CT1jkR
http://blog.itpub.net/26736162/viewspace-2112518
# ipcs |more
------ Message Queues --------
key msqid owner perms used-bytes messages
------ Shared Memory Segments --------
key shmid owner perms bytes nattch status
0x00000000 0 root 644 80 2
0x00000000 32769 root 644 16384 2
0x00000000 65538 root 644 280 2
------ Semaphore Arrays --------
key semid owner perms nsems
0x00000000 1341751362 root 600 14
0x00000000 1341784131 root 600 14
0x00000000 1341816900 root 600 14
0x00000000 1341849669 root 600 14
0x00000000 1341882438 root 600 14
0x00000000 1341915207 root 600 14
0x00000000 1341947976 root 600 14
0x00000000 1341980745 root 600 14
0x00000000 1342013514 root 600 14
0x00000000 1342046283 root 600 14
0x00000000 1342079052 root 600 14
0x00000000 1342111821 root 600 14
0x00000000 1342144590 root 600 14
0x00000000 1342177359 root 600 14
0x00000000 1342210128 root 600 14
0x00000000 1342242897 root 600 14
0x00000000 1342275666 root 600 14
0x00000000 1342308435 root 600 14
0x00000000 1342341204 root 600 14
0x00000000 1342373973 root 600 14
0x00000000 1342406742 root 600 14
0x00000000 1342439511 root 600 14
0x00000000 1342472280 root 600 14
0x00000000 1342505049 root 600 14
0x00000000 1342537818 root 600 14
0x00000000 1342570587 root 600 14
0x00000000 1342603356 root 600 14
0x00000000 1342636125 root 600 14
0x00000000 1342668894 root 600 14
0x00000000 1342701663 root 600 14
0x00000000 1342734432 root 600 14
0x00000000 1342767201 root 600 14
0x00000000 1342799970 root 600 14
0x00000000 1342832739 root 600 14
0x00000000 1342865508 root 600 14
0x00000000 1342898277 root 600 14
0x00000000 1342931046 root 600 14
0x00000000 1342963815 root 600 14
0x00000000 1342996584 root 600 14
0x00000000 1343029353 root 600 14
0x00000000 1343062122 root 600 14
0x00000000 1343094891 root 600 14
0x00000000 1343127660 root 600 14
0x00000000 1343160429 root 600 14
0x00000000 1343193198 root 600 14
0x00000000 1343225967 root 600 14
0x00000000 1343258736 root 600 14
0x00000000 1343291505 root 600 14
0x00000000 1343324274 root 600 14
0x00000000 1343357043 root 600 14
0x00000000 1343389812 root 600 14
0x00000000 1343422581 root 600 14
0x00000000 1343455350 root 600 14
0x00000000 1343488119 root 600 14
0x00000000 1343520888 root 600 14
0x00000000 1343553657 root 600 14
0x00000000 1343586426 root 600 14
0x00000000 1343619195 root 600 14
0x00000000 1343651964 root 600 14
0x00000000 249896281 root 600 14
0x00000000 249929050 root 600 14
0x00000000 249961819 root 600 14
0x00000000 249994588 root 600 14
0x00000000 250027357 root 600 14
0x00000000 250060126 root 600 14
0x00000000 250092895 root 600 14
0x00000000 250125664 root 600 14
0x00000000 250158433 root 600 14
0x00000000 250191202 root 600 14
0x00000000 250223971 root 600 14
0x00000000 250256740 root 600 14
0x00000000 250289509 root 600 14
0x00000000 250322278 root 600 14
0x00000000 250355047 root 600 14
0x00000000 250387816 root 600 14
0x00000000 250420585 root 600 14
0x00000000 250453354 root 600 14