Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
c0c722b
Extended transient API of QMeta to read workers-to-chunks map from da…
iagaponenko Apr 5, 2024
9f411e4
CzarFamilyMap create now waits for a successful read.
jgates108 Jul 22, 2024
8cabd97
Changed Czar to catch 5GB limit.
jgates108 Dec 18, 2024
154b22b
Extended transient API of QMeta to read workers-to-chunks map from da…
iagaponenko Apr 5, 2024
cb844de
Czar and workers can send http messages to each other.
jgates108 May 16, 2024
7e288c5
Added cancellation code for queries, uberjobs, and czar restart.
jgates108 Sep 3, 2024
ec62110
Added handling for the case where a worker believed the czar was dead.
jgates108 Oct 1, 2024
1df00f0
Changed Czar to catch 5GB limit.
jgates108 Dec 18, 2024
dd50482
Added family map option to not use chunk size for distribution.
jgates108 Feb 12, 2025
e202f32
Added JobErrorMsg.
jgates108 Apr 29, 2025
9eca274
Removed protobufs.
jgates108 May 9, 2025
cad684f
Added worker executable.
jgates108 May 14, 2025
2645183
Improved implementation of the chunk map building algorithm
iagaponenko Jun 25, 2025
c1b7634
Added memory/disk hybrid for transferring csv files.
jgates108 Jul 14, 2025
a218161
CzarFamilyMap create now waits for a successful read.
jgates108 Jul 22, 2024
45437d3
Added worker executable.
jgates108 May 14, 2025
968230a
Added memory/disk hybrid for transferring csv files.
jgates108 Jul 14, 2025
96b1950
Added worker exe, memory/disk hybrid for transferring csv files, fixed…
jgates108 Dec 18, 2024
f30208e
UberJobReadyMsg transmit retries are now sent via WorkerCzarComIssue …
jgates108 Nov 19, 2025
c7b8c74
Rebase fixes.
jgates108 Nov 13, 2025
c12c8b3
Merge pull request #964 from lsst/tickets/DM-51870
jgates108 Oct 13, 2025
a239925
Review changes
jgates108 Dec 11, 2025
8047161
Merge pull request #976 from lsst/tickets/DM-52880
jgates108 Nov 17, 2025
de8099d
Merge pull request #978 from lsst/tickets/DM-53242
jgates108 Dec 15, 2025
6761aa8
cmake cleanups
fritzm Oct 18, 2025
63978a6
Whitespace cleanups
fritzm Oct 18, 2025
27945b4
Don't build DEBUG by default
fritzm Oct 18, 2025
f28d670
Fix USING_VMUTEX compiler warn
fritzm Oct 16, 2025
9e54a6c
Worker app cleanups
fritzm Oct 19, 2025
1c3445a
Foreman cleanups
fritzm Oct 18, 2025
1866fbb
Support explicitly named workers
fritzm Oct 25, 2025
55d8409
Use password for qsreplica user
fritzm Oct 27, 2025
f1dd77f
Rename .cnf to .cfg consistently
fritzm Dec 10, 2025
4cb86b5
Merge branch 'tickets/DM-53921' into tickets/DM-43715
fritzm Jan 24, 2026
9a2283e
Support empty chunk map
fritzm Oct 26, 2025
b20c52d
Remove local db check at czar launch
fritzm Oct 29, 2025
300a3e6
Merge branch 'tickets/DM-54066' into tickets/DM-43715
fritzm Feb 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
24 changes: 4 additions & 20 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -305,37 +305,21 @@ jobs:
if: always()
run: docker logs ${USER}-czar-http-1

- name: Czar CMSD Log
if: always()
run: docker logs ${USER}-czar-cmsd-1

- name: Czar XROOTD Log
if: always()
run: docker logs ${USER}-czar-xrootd-1

- name: Czar MariaDB Log
if: always()
run: docker logs ${USER}-czar-mariadb-1

- name: Qzerv Worker 0 CMSD Log
- name: Qzerv Worker 0 worker-svc Log
if: always()
run: docker logs ${USER}-worker-cmsd-0-1

- name: Qzerv Worker 0 XROOTD Log
if: always()
run: docker logs ${USER}-worker-xrootd-0-1
run: docker logs ${USER}-worker-svc-0-1

- name: Qzerv Worker 0 MariaDB Log
if: always()
run: docker logs ${USER}-worker-mariadb-0-1

- name: Qzerv Worker 1 CMSD Log
if: always()
run: docker logs ${USER}-worker-cmsd-1-1

- name: Qzerv Worker 1 XROOTD Log
- name: Qzerv Worker 1 worker-svc Log
if: always()
run: docker logs ${USER}-worker-xrootd-1-1
run: docker logs ${USER}-worker-svc-1-1

- name: Qzerv Worker 1 MariaDB Log
if: always()
Expand Down
174 changes: 36 additions & 138 deletions deploy/compose/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,11 @@ x-log-volume:
- type: bind
source: ./log/
target: /config-etc/log/
x-worker-cmsd:
&worker-cmsd
image: "${QSERV_IMAGE:?err}"
init: true
# ports are published in worker-xrootd because this container uses that container's network stack.
x-worker-xrootd:
&worker-xrootd
x-worker-svc:
&worker-svc
image: "${QSERV_IMAGE:?err}"
init: true
expose:
- "1094"
- "2131"
- "3306" # for the worker db, which shares this container's network stack.
x-repl-worker:
&repl-worker
Expand All @@ -43,6 +36,7 @@ volumes:
volume_czar_xrootd:
volume_czar_home:
volume_czar_cfg:
volume_czar_transfer:

volume_czar_mariadb_data:
volume_czar_mariadb_cfg:
Expand All @@ -53,14 +47,12 @@ volumes:

volume_worker_0_data:
volume_worker_0_results:
volume_worker_0_xrootd:
volume_worker_0_home:
volume_worker_0_mariadb_lib:
volume_worker_0_mariadb_run:

volume_worker_1_data:
volume_worker_1_results:
volume_worker_1_xrootd:
volume_worker_1_home:
volume_worker_1_mariadb_lib:
volume_worker_1_mariadb_run:
Expand Down Expand Up @@ -97,29 +89,25 @@ services:
- type: volume
source: volume_worker_0_mariadb_run
target: /var/run/mysqld # This is where the mariadb container puts the socket file
network_mode: "service:worker-xrootd-0"
worker-xrootd-0:
<< : *worker-xrootd
network_mode: "service:worker-svc-0"

worker-svc-0:
<< : *worker-svc
command: >
entrypoint worker-xrootd
entrypoint worker-svc
--db-uri mysql://qsmaster:CHANGEME@127.0.0.1:3306
--db-admin-uri mysql://root:CHANGEME@127.0.0.1:3306
--vnid-config "@/usr/local/lib64/libreplica.so {{db_uri}}/qservw_worker 0 0"
--repl-instance-id qserv_proj
--repl-auth-key replauthkey
--repl-admin-auth-key=repladminauthkey
--repl-registry-host repl-registry
--repl-registry-port 25082
--results-dirname /qserv/data/results
--cmsd-manager-name czar-xrootd
--log-cfg-file=/config-etc/log/log-worker-xrootd.cnf
--log-cfg-file=/config-etc/log/log-worker-svc.cfg
volumes:
- type: volume
source: volume_worker_0_results
target: /qserv/data/results
- type: volume
source: volume_worker_0_xrootd
target: /var/run/xrootd
- type: volume
source: volume_worker_0_home
target: /home/qserv
Expand All @@ -130,43 +118,14 @@ services:
networks:
default:
aliases:
- worker-cmsd-0
- worker-mariadb-0
worker-cmsd-0:
<< : *worker-cmsd
command: >
entrypoint worker-cmsd
--db-uri mysql://qsmaster:CHANGEME@worker-mariadb-0:3306
--vnid-config "@/usr/local/lib64/libreplica.so mysql://qsmaster:CHANGEME@127.0.0.1:3306/qservw_worker 0 0"
--results-dirname /qserv/data/results
--repl-instance-id qserv_proj
--repl-auth-key replauthkey
--repl-admin-auth-key=repladminauthkey
--repl-registry-host repl-registry
--repl-registry-port 25082
--cmsd-manager-name czar-xrootd
network_mode: "service:worker-xrootd-0"
volumes:
- type: volume
source: volume_worker_0_results
target: /qserv/data/results
- type: volume
source: volume_worker_0_xrootd
target: /var/run/xrootd
- type: volume
source: volume_worker_0_home
target: /home/qserv
- type: volume
source: volume_worker_0_mariadb_run
target: /qserv/mariadb/run # This matches the ?socket=... location in --db-uri and --db-admin-uri
- << : *log-volume
repl-worker-0:
<< : *repl-worker
command: >
entrypoint worker-repl
--db-admin-uri mysql://root:CHANGEME@worker-mariadb-0:3306/qservw_worker
--repl-connection mysql://qsreplica@repl-mariadb:3306/qservReplica
--log-cfg-file=/config-etc/log/log-repl-worker.cnf
--repl-connection mysql://qsreplica:CHANGEME@repl-mariadb:3306/qservReplica
--log-cfg-file=/config-etc/log/log-repl-worker.cfg
--
--instance-id=qserv_proj
--auth-key=replauthkey
Expand All @@ -184,6 +143,7 @@ services:
source: volume_worker_0_home
target: /home/qserv
- << : *log-volume

# worker 1 uses and validates socket file (where possible) to connect to the worker-mariadb
worker-mariadb-1:
<< : *worker-mariadb
Expand All @@ -201,30 +161,26 @@ services:
- type: volume
source: volume_worker_1_mariadb_run
target: /var/run/mysqld # This is where the mariadb container puts the socket file
network_mode: "service:worker-xrootd-1"
worker-xrootd-1:
<< : *worker-xrootd
network_mode: "service:worker-svc-1"

worker-svc-1:
<< : *worker-svc
command: >
entrypoint --log-level DEBUG worker-xrootd
entrypoint --log-level DEBUG worker-svc
--db-uri mysql://qsmaster:CHANGEME@127.0.0.1:3306?socket={{db_socket}}
--db-admin-uri mysql://root:CHANGEME@127.0.0.1:3306?socket={{db_socket}}
--vnid-config "@/usr/local/lib64/libreplica.so mysql://qsmaster:CHANGEME@127.0.0.1:3306/qservw_worker 0 0"
--repl-instance-id qserv_proj
--repl-auth-key replauthkey
--repl-admin-auth-key=repladminauthkey
--repl-registry-host repl-registry
--repl-registry-port 25082
--results-dirname /qserv/data/results
--cmsd-manager-name czar-xrootd
--targs db_socket=/qserv/mariadb/run/mysqld.sock
--log-cfg-file=/config-etc/log/log-worker-xrootd.cnf
--log-cfg-file=/config-etc/log/log-worker-svc.cfg
volumes:
- type: volume
source: volume_worker_1_results
target: /qserv/data/results
- type: volume
source: volume_worker_1_xrootd
target: /var/run/xrootd
- type: volume
source: volume_worker_1_home
target: /home/qserv
Expand All @@ -235,44 +191,15 @@ services:
networks:
default:
aliases:
- worker-cmsd-1
- worker-mariadb-1
worker-cmsd-1:
<< : *worker-cmsd
command: >
entrypoint --log-level DEBUG worker-cmsd
--db-uri mysql://qsmaster:CHANGEME@worker-mariadb-1:3306?socket=/qserv/mariadb/run/mysqld.sock
--vnid-config "@/usr/local/lib64/libreplica.so mysql://qsmaster:CHANGEME@127.0.0.1:3306/qservw_worker 0 0"
--results-dirname /qserv/data/results
--repl-instance-id qserv_proj
--repl-auth-key replauthkey
--repl-admin-auth-key=repladminauthkey
--repl-registry-host repl-registry
--repl-registry-port 25082
--cmsd-manager-name czar-xrootd
network_mode: "service:worker-xrootd-1"
volumes:
- type: volume
source: volume_worker_1_results
target: /qserv/data/results
- type: volume
source: volume_worker_1_xrootd
target: /var/run/xrootd
- type: volume
source: volume_worker_1_home
target: /home/qserv
- type: volume
source: volume_worker_1_mariadb_run
target: /qserv/mariadb/run
- << : *log-volume
repl-worker-1:
<< : *repl-worker
# qserv-replica-worker app does not support socket file yet.
command: >
entrypoint worker-repl
--db-admin-uri mysql://root:CHANGEME@worker-mariadb-1:3306/qservw_worker
--repl-connection mysql://qsreplica@repl-mariadb:3306/qservReplica
--log-cfg-file=/config-etc/log/log-repl-worker.cnf
--repl-connection mysql://qsreplica:CHANGEME@repl-mariadb:3306/qservReplica
--log-cfg-file=/config-etc/log/log-repl-worker.cfg
--
--instance-id=qserv_proj
--auth-key=replauthkey
Expand All @@ -290,42 +217,7 @@ services:
source: volume_worker_1_home
target: /home/qserv
- << : *log-volume
czar-xrootd:
image: "${QSERV_IMAGE:?err}"
init: true
command: >
entrypoint xrootd-manager
--cmsd-manager-name czar-xrootd
hostname: czar-xrootd
expose:
- "1094"
- "2131"
volumes:
- type: volume
source: volume_czar_xrootd
target: /var/run/xrootd
- type: volume
source: volume_worker_1_home
target: /home/qserv
- << : *log-volume
networks:
default:
aliases:
- czar-cmsd
czar-cmsd:
image: "${QSERV_IMAGE:?err}"
init: true
# NOTE!! cms-delay-servers must match the number of workers being launched!
command: entrypoint cmsd-manager --cms-delay-servers 2
network_mode: "service:czar-xrootd"
volumes:
- type: volume
source: volume_czar_xrootd
target: /var/run/xrootd
- type: volume
source: volume_czar_home
target: /home/qserv
- << : *log-volume

czar-mariadb:
image: "${QSERV_MARIADB_IMAGE:?err}"
init: true
Expand All @@ -349,6 +241,7 @@ services:
- type: volume
source: volume_czar_mariadb_run
target: /var/run/mysqld

czar-proxy:
image: "${QSERV_IMAGE:?err}"
init: true
Expand All @@ -357,8 +250,7 @@ services:
--db-uri mysql://qsmaster:CHANGEME@127.0.0.1:3306?socket={{db_socket}}
--db-admin-uri mysql://root:CHANGEME@127.0.0.1:3306?socket={{db_socket}}
--targs db_socket=/qserv/mariadb/run/mysqld.sock
--xrootd-manager czar-xrootd
--log-cfg-file=/config-etc/log/log-czar-proxy.cnf
--log-cfg-file=/config-etc/log/log-czar-proxy.cfg
--repl-instance-id qserv_proj
--repl-auth-key replauthkey
--repl-admin-auth-key=repladminauthkey
Expand All @@ -377,6 +269,10 @@ services:
- type: volume
source: volume_czar_mariadb_run
target: /qserv/mariadb/run
- type: volume
source: volume_czar_transfer
target: /tmp

- << : *log-volume
expose:
- "3306" # for czar-mariadb
Expand All @@ -393,7 +289,6 @@ services:
command: >
entrypoint --log-level DEBUG czar-http
--db-uri mysql://qsmaster:CHANGEME@czar-mariadb:3306/
--xrootd-manager czar-xrootd
--czar-name http
--http-port 4048
--http-threads 4
Expand All @@ -404,7 +299,7 @@ services:
--http-conn-pool-size 2
--user qsmaster
--password CHANGEME
--log-cfg-file=/config-etc/log/log-czar-proxy.cnf
--log-cfg-file=/config-etc/log/log-czar-proxy.cfg
--repl-instance-id qserv_proj
--repl-auth-key replauthkey
--repl-admin-auth-key=repladminauthkey
Expand All @@ -414,6 +309,9 @@ services:
- type: volume
source: volume_czar_cfg
target: /config-etc
- type: volume
source: volume_czar_transfer
target: /tmp
- type: volume
source: volume_czar_home
target: /home/qserv
Expand Down Expand Up @@ -445,18 +343,18 @@ services:
init: true
command: >
entrypoint --log-level DEBUG replication-controller
--db-uri mysql://qsreplica@repl-mariadb:3306/qservReplica
--db-uri mysql://qsreplica:CHANGEME@repl-mariadb:3306/qservReplica
--db-admin-uri mysql://root:CHANGEME@repl-mariadb:3306/qservReplica
--qserv-czar-db=mysql://root:CHANGEME@czar-mariadb:3306/qservMeta
--log-cfg-file=/config-etc/log/log-repl-controller.cnf
--log-cfg-file=/config-etc/log/log-repl-controller.cfg
--
--instance-id=qserv_proj
--auth-key=replauthkey
--admin-auth-key=repladminauthkey
--xrootd-host=czar-xrootd
--registry-host=repl-registry
--controller-auto-register-workers=1
--qserv-sync-force
--qserv-chunk-map-update
--debug
expose:
- "25081"
Expand All @@ -478,9 +376,9 @@ services:
init: true
command: >
entrypoint --log-level DEBUG replication-registry
--db-uri mysql://qsreplica@repl-mariadb:3306/qservReplica
--db-uri mysql://qsreplica:CHANGEME@repl-mariadb:3306/qservReplica
--db-admin-uri mysql://root:CHANGEME@repl-mariadb:3306/qservReplica
--log-cfg-file=/config-etc/log/log-repl-registry.cnf
--log-cfg-file=/config-etc/log/log-repl-registry.cfg
--
--instance-id=qserv_proj
--auth-key=replauthkey
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,3 @@ log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=%d{yyyy-MM-ddTHH:mm:ss.SSSZ} LWP %-5X{LWP} %-5p %m%n

log4j.logger.lsst.qserv.xrdssi.msgs=WARN
Loading