背景
原理与修复步骤
以EDB PG 11为例讲解
环境部署
《MTK使用 - PG,PPAS,oracle,mysql,ms sql,sybase 迁移到 PG, PPAS (支持跨版本升级)》
export PS1="$USER@`/bin/hostname -s`-> "export PGPORT=4000export PGDATA=/data04/ppas11/pg_root4000export LANG=en_US.utf8export PGHOME=/usr/edb/as11export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib:$LD_LIBRARY_PATHexport DATE=`date +"%Y%m%d%H%M"`export PATH=$PGHOME/bin:$PATH:.export MANPATH=$PGHOME/share/man:$MANPATHexport PGHOST=127.0.0.1export PGUSER=postgresexport PGDATABASE=postgresalias rm='rm -i'alias ll='ls -lh'unalias vi
initdb -D data04/ppas11/pg_root4000 -E UTF8 --lc-collate=C --lc-ctype=en_US.UTF8 -U postgres -k --redwood-like
cd $PGDATAcp $PGHOME/share/recovery.conf.sample ./mv recovery.conf.sample recovery.donevi recovery.donerestore_command = 'cp data04/ppas11/wal/%f %p'recovery_target_timeline = 'latest'standby_mode = onprimary_conninfo = 'host=localhost port=4000 user=postgres'
postgresql.conflisten_addresses = '0.0.0.0'port = 4000max_connections = 8000superuser_reserved_connections = 13unix_socket_directories = '.,/tmp'unix_socket_permissions = 0700tcp_keepalives_idle = 60tcp_keepalives_interval = 10tcp_keepalives_count = 10shared_buffers = 16GBmax_prepared_transactions = 8000maintenance_work_mem = 1GBautovacuum_work_mem = 1GBdynamic_shared_memory_type = posixvacuum_cost_delay = 0bgwriter_delay = 10msbgwriter_lru_maxpages = 1000bgwriter_lru_multiplier = 10.0effective_io_concurrency = 0max_worker_processes = 128max_parallel_maintenance_workers = 8max_parallel_workers_per_gather = 8max_parallel_workers = 24wal_level = replicasynchronous_commit = offfull_page_writes = onwal_compression = onwal_buffers = 32MBwal_writer_delay = 10mscheckpoint_timeout = 25minmax_wal_size = 32GBmin_wal_size = 8GBcheckpoint_completion_target = 0.2archive_mode = onarchive_command = 'cp -n %p data04/ppas11/wal/%f'max_wal_senders = 16wal_keep_segments = 4096max_replication_slots = 16hot_standby = onmax_standby_archive_delay = 300smax_standby_streaming_delay = 300swal_receiver_status_interval = 1swal_receiver_timeout = 10srandom_page_cost = 1.1effective_cache_size = 400GBlog_destination = 'csvlog'logging_collector = onlog_directory = 'log'log_filename = 'edb-%a.log'log_truncate_on_rotation = onlog_rotation_age = 1dlog_rotation_size = 0log_min_duration_statement = 1slog_checkpoints = onlog_error_verbosity = verboselog_line_prefix = '%t 'log_lock_waits = onlog_statement = 'ddl'log_timezone = 'PRC'autovacuum = onlog_autovacuum_min_duration = 0autovacuum_max_workers = 6autovacuum_freeze_max_age = 1200000000autovacuum_multixact_freeze_max_age = 1400000000autovacuum_vacuum_cost_delay = 0statement_timeout = 0lock_timeout = 0idle_in_transaction_session_timeout = 0vacuum_freeze_table_age = 1150000000vacuum_multixact_freeze_table_age = 1150000000datestyle = 'redwood,show_time'timezone = 'PRC'lc_messages = 'en_US.utf8'lc_monetary = 'en_US.utf8'lc_numeric = 'en_US.utf8'lc_time = 'en_US.utf8'default_text_search_config = 'pg_catalog.english'shared_preload_libraries = 'auto_explain,pg_stat_statements,$libdir/dbms_pipe,$libdir/edb_gen,$libdir/dbms_aq'edb_redwood_date = onedb_redwood_greatest_least = onedb_redwood_strings = ondb_dialect = 'redwood'edb_dynatune = 66edb_dynatune_profile = oltptimed_statistics = off
local all all trusthost all all 127.0.0.1/32 trusthost all all ::1/128 trustlocal replication all trusthost replication all 127.0.0.1/32 trusthost replication all ::1/128 trusthost all all 0.0.0.0/0 md5
mkdir data04/ppas11/walchown enterprisedb:enterprisedb data04/ppas11/wal
pg_basebackup -h 127.0.0.1 -p 4000 -D data04/ppas11/pg_root4001 -F p -c fast
cd data04/ppas11/pg_root4001mv recovery.done recovery.confvi postgresql.confport = 4001
pg_ctl start -D data04/ppas11/pg_root4001
pgbench -i -s 1000pgbench -M prepared -v -r -P 1 -c 24 -j 24 -T 300
postgres=# select * from pg_stat_archiver ;archived_count | last_archived_wal | last_archived_time | failed_count | last_failed_wal | last_failed_time | stats_reset----------------+--------------------------+----------------------------------+--------------+-----------------+------------------+----------------------------------240 | 0000000100000000000000F0 | 28-JAN-19 15:08:43.276965 +08:00 | 0 | | | 28-JAN-19 15:01:17.883338 +08:00(1 row)postgres=# select * from pg_stat_archiver ;archived_count | last_archived_wal | last_archived_time | failed_count | last_failed_wal | last_failed_time | stats_reset----------------+--------------------------+----------------------------------+--------------+-----------------+------------------+----------------------------------248 | 0000000100000000000000F8 | 28-JAN-19 15:08:45.120134 +08:00 | 0 | | | 28-JAN-19 15:01:17.883338 +08:00(1 row)
postgres=# select * from pg_stat_replication ;-[ RECORD 1 ]----+---------------------------------pid | 8124usesysid | 10usename | postgresapplication_name | walreceiverclient_addr | 127.0.0.1client_hostname |client_port | 62988backend_start | 28-JAN-19 15:07:34.084542 +08:00backend_xmin |state | streamingsent_lsn | 1/88BC2000write_lsn | 1/88BC2000flush_lsn | 1/88BC2000replay_lsn | 1/88077D48write_lag | 00:00:00.001417flush_lag | 00:00:00.002221replay_lag | 00:00:00.097657sync_priority | 0sync_state | async
例子1,从库激活后产生读写,使用pg_rewind修复从库,回退到只读从库
pg_ctl promote -D data04/ppas11/pg_root4001
pgbench -M prepared -v -r -P 1 -c 4 -j 4 -T 120 -p 4001
enterprisedb@pg11-test-> pg_controldata -D data04/ppas11/pg_root4001|grep -i timeLatest checkpoint's TimeLineID: 1Latest checkpoint's PrevTimeLineID: 1Time of latest checkpoint: Mon 28 Jan 2019 03:56:38 PM CSTMin recovery ending loc's timeline: 2track_commit_timestamp setting: offDate/time type storage: 64-bit integersenterprisedb@pg11-test-> pg_controldata -D data04/ppas11/pg_root4000|grep -i timeLatest checkpoint's TimeLineID: 1Latest checkpoint's PrevTimeLineID: 1Time of latest checkpoint: Mon 28 Jan 2019 05:11:38 PM CSTMin recovery ending loc's timeline: 0track_commit_timestamp setting: offDate/time type storage: 64-bit integers
cd data04/ppas11/pg_root4001ll pg_wal/*.history-rw------- 1 enterprisedb enterprisedb 42 Jan 28 17:15 pg_wal/00000002.historycat pg_wal/00000002.history1 6/48C62000 no recovery target specified
-rw------- 1 enterprisedb enterprisedb 42 Jan 28 17:15 00000002.history-rw------- 1 enterprisedb enterprisedb 16M Jan 28 17:16 000000020000000600000048............
recovery.confrestore_command = 'cp data04/ppas11/wal/%f %p'
pg_rewind --helppg_rewind resynchronizes a PostgreSQL cluster with another copy of the cluster.Usage:pg_rewind [OPTION]...Options:-D, --target-pgdata=DIRECTORY existing data directory to modify--source-pgdata=DIRECTORY source data directory to synchronize with--source-server=CONNSTR source server to synchronize with-n, --dry-run stop before modifying anything-P, --progress write progress messages--debug write a lot of debug messages-V, --version output version information, then exit-?, --help show this help, then exitReport bugs to <support@enterprisedb.com>.
pg_ctl stop -m fast -D data04/ppas11/pg_root4001
pg_rewind -n -D data04/ppas11/pg_root4001 --source-server="hostaddr=127.0.0.1 user=postgres port=4000"servers diverged at WAL location 6/48C62000 on timeline 1rewinding from last common checkpoint at 5/5A8CD30 on timeline 1Done!
pg_rewind -D data04/ppas11/pg_root4001 --source-server="hostaddr=127.0.0.1 user=postgres port=4000"servers diverged at WAL location 6/48C62000 on timeline 1rewinding from last common checkpoint at 5/5A8CD30 on timeline 1Done!
cd data04/ppas11/pg_root4001vi postgresql.confport = 4001mv recovery.done recovery.confvi recovery.confrestore_command = 'cp data04/ppas11/wal/%f %p'recovery_target_timeline = 'latest'standby_mode = onprimary_conninfo = 'host=localhost port=4000 user=postgres'
mkdir /data04/ppas11/wal/error_tl_2mv /data04/ppas11/wal/00000002* /data04/ppas11/wal/error_tl_2
pg_ctl start -D /data04/ppas11/pg_root4001
psqlcheckpoint;
pgbench -M prepared -v -r -P 1 -c 16 -j 16 -T 200 -p 4000
postgres=# select * from pg_stat_archiver ;archived_count | last_archived_wal | last_archived_time | failed_count | last_failed_wal | last_failed_time | stats_reset----------------+--------------------------+----------------------------------+--------------+-----------------+------------------+----------------------------------1756 | 0000000100000006000000DC | 28-JAN-19 17:41:57.562425 +08:00 | 0 | | | 28-JAN-19 15:01:17.883338 +08:00(1 row)
postgres=# select * from pg_stat_replication ;-[ RECORD 1 ]----+--------------------------------pid | 13179usesysid | 10usename | postgresapplication_name | walreceiverclient_addr | 127.0.0.1client_hostname |client_port | 63198backend_start | 28-JAN-19 17:47:29.85308 +08:00backend_xmin |state | catchupsent_lsn | 7/DDE80000write_lsn | 7/DC000000flush_lsn | 7/DC000000replay_lsn | 7/26A8DCB0write_lag | 00:00:18.373263flush_lag | 00:00:18.373263replay_lag | 00:00:18.373263sync_priority | 0sync_state | async
例子2,从库激活成为新主库后,老主库依旧有读写,使用pg_rewind修复老主库,将老主库降级为新主库的从库
pg_ctl promote -D /data04/ppas11/pg_root4001
pgbench -M prepared -v -r -P 1 -c 16 -j 16 -T 200 -p 4001
pgbench -M prepared -v -r -P 1 -c 16 -j 16 -T 200 -p 4000
enterprisedb@pg11-test-> pg_controldata -D /data04/ppas11/pg_root4000|grep -i timelineLatest checkpoint's TimeLineID: 1Latest checkpoint's PrevTimeLineID: 1Min recovery ending loc's timeline: 0enterprisedb@pg11-test-> pg_controldata -D /data04/ppas11/pg_root4001|grep -i timelineLatest checkpoint's TimeLineID: 1Latest checkpoint's PrevTimeLineID: 1Min recovery ending loc's timeline: 2enterprisedb@pg11-test-> cd /data04/ppas11/pg_root4001/pg_walenterprisedb@pg11-test-> cat 00000002.history1 8/48DE2318 no recovery target specifiedenterprisedb@pg11-test-> ll *.partial-rw------- 1 enterprisedb enterprisedb 16M Jan 28 17:48 000000010000000800000048.partial
recovery.confrestore_command = 'cp /data04/ppas11/wal/%f %p'
pg_ctl stop -m fast -D /data04/ppas11/pg_root4000
pg_rewind -n -D /data04/ppas11/pg_root4000 --source-server="hostaddr=127.0.0.1 user=postgres port=4001"servers diverged at WAL location 8/48DE2318 on timeline 1rewinding from last common checkpoint at 6/CCCEF770 on timeline 1Done!
pg_rewind -D /data04/ppas11/pg_root4000 --source-server="hostaddr=127.0.0.1 user=postgres port=4001"
cd /data04/ppas11/pg_root4000vi postgresql.confport = 4000mv recovery.done recovery.confvi recovery.confrestore_command = 'cp /data04/ppas11/wal/%f %p'recovery_target_timeline = 'latest'standby_mode = onprimary_conninfo = 'host=localhost port=4001 user=postgres'
pg_ctl start -D /data04/ppas11/pg_root4000
checkpoint;
pgbench -M prepared -v -r -P 1 -c 16 -j 16 -T 200 -p 4001
psql -p 4001postgres=# select * from pg_stat_archiver ;archived_count | last_archived_wal | last_archived_time | failed_count | last_failed_wal | last_failed_time | stats_reset----------------+--------------------------+----------------------------------+--------------+-----------------+------------------+----------------------------------406 | 0000000200000009000000DB | 28-JAN-19 21:18:22.976118 +08:00 | 0 | | | 28-JAN-19 17:47:29.847488 +08:00(1 row)
psql -p 4001postgres=# select * from pg_stat_replication ;-[ RECORD 1 ]----+---------------------------------pid | 17675usesysid | 10usename | postgresapplication_name | walreceiverclient_addr | 127.0.0.1client_hostname |client_port | 60530backend_start | 28-JAN-19 21:18:36.472197 +08:00backend_xmin |state | streamingsent_lsn | 9/E8361C18write_lsn | 9/E8361C18flush_lsn | 9/E8361C18replay_lsn | 9/D235B520write_lag | 00:00:00.000101flush_lag | 00:00:00.000184replay_lag | 00:00:03.028098sync_priority | 0sync_state | async
小结
1 适合场景
2 前提
initdb -k 开启data_checksums
3 原理与修复流程
最后修改时间:2021-03-19 11:13:11
文章转载自PostgreSQL中文社区,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。




