Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 181 additions & 0 deletions doajtest/bisector.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#!/usr/bin/env bash
# language: bash
# Save as doajtest/bisector.sh and make executable (chmod +x doajtest/bisector.sh)

#set -euo pipefail
#
#TMPDIR=$(mktemp -d)
#ALL="$TMPDIR/all_tests.txt"
#OTHERS="$TMPDIR/others.txt"
#LOCKS="$TMPDIR/locks.txt"
#
#pytest --collect-only -q > "$ALL"
#
#grep 'test_lock.py' "$ALL" > "$LOCKS" || true
#grep -v 'test_lock.py' "$ALL" > "$OTHERS" || true
#
#if [ ! -s "$LOCKS" ]; then
# echo "No tests found in test_lock.py"
# rm -rf "$TMPDIR"
# exit 1
#fi
#
#mapfile -t LOCK_ARR < "$LOCKS"
#mapfile -t OTHER_ARR < "$OTHERS"
#N=${#OTHER_ARR[@]}
#
#if [ "$N" -eq 0 ]; then
# echo "No other tests to blame; run `pytest ${LOCK_ARR[*]}` manually."
# rm -rf "$TMPDIR"
# exit 0
#fi
#
#run_with_lock() {
# # $1..$k are nodeids from OTHER_ARR, then append LOCK_ARR
# local nodes=("$@")
# nodes+=("${LOCK_ARR[@]}")
# pytest -q "${nodes[@]}" >/dev/null 2>&1
# return $?
#}
#
#low=0
#high=$N
#
#while [ $((high - low)) -gt 1 ]; do
# mid=$(((low + high) / 2))
# subset=("${OTHER_ARR[@]:0:mid}")
# echo "Testing first $mid of $N tests..."
# if run_with_lock "${subset[@]}"; then
# echo "PASS -> culprit is in tests [$mid..$((N-1))]"
# low=$mid
# else
# echo "FAIL -> culprit is in first $mid tests"
# high=$mid
# fi
#done
#
## Report suspects: small slice around boundary
#start=$((low - 2)); [ $start -lt 0 ] && start=0
#end=$((low + 3)); [ $end -gt $N ] && end=$N
#
#echo
#echo "Suspect tests (run these before test_lock.py to reproduce):"
#for i in $(seq $start $((end - 1))); do
# printf ' %s\n' "${OTHER_ARR[$i]}"
#done
#
#echo
#echo "Re-run one of the suspect sets manually, e.g.:"
#echo " pytest -q ${OTHER_ARR[$start]} ${LOCK_ARR[*]}"
#rm -rf "$TMPDIR"


set -euo pipefail

# Working area for the collected test lists.
# NOTE(review): this shadows the conventional TMPDIR env var; mktemp has
# already consumed the inherited value, so the shadowing is harmless here,
# but a rename (e.g. WORKDIR) would be cleaner if the script grows.
TMPDIR=$(mktemp -d)
ALL="$TMPDIR/all_tests.txt"
OTHERS="$TMPDIR/others.txt"
LOCKS="$TMPDIR/locks.txt"

# Clean up on EVERY exit path (success, error under set -e, or signal).
# Previously the temp dir leaked whenever pytest/grep failed mid-script.
trap 'rm -rf "$TMPDIR"' EXIT

# timeout in seconds for running test_lock.py (adjust as needed)
TIMEOUT_SECONDS=65

pytest --collect-only -q > "$ALL"

# `pytest -q --collect-only` ends with a blank line and an "N tests
# collected ..." summary; keep only real nodeids (they contain '::') so
# junk lines are never handed back to pytest as arguments.
grep '::' "$ALL" | grep 'test_lock.py' > "$LOCKS" || true
grep '::' "$ALL" | grep -v 'test_lock.py' > "$OTHERS" || true

if [ ! -s "$LOCKS" ]; then
    echo "No tests found in test_lock.py"
    exit 1
fi

mapfile -t LOCK_ARR < "$LOCKS"
mapfile -t OTHER_ARR < "$OTHERS"
N=${#OTHER_ARR[@]}

if [ "$N" -eq 0 ]; then
    echo "No other tests to blame; run \`pytest ${LOCK_ARR[*]}\` manually."
    exit 0
fi

run_with_lock() {
    # Run pytest on the given nodeids followed by the test_lock.py tests,
    # bounded by TIMEOUT_SECONDS.  Returns pytest's exit status, or 1 on
    # timeout.  pytest output is discarded; diagnostics go to stdout.
    #
    # Globals read: LOCK_ARR, TIMEOUT_SECONDS
    set +e  # a failing/timed-out pytest must not abort the whole bisection
    local nodes=("$@")
    nodes+=("${LOCK_ARR[@]}")
    local rc=0

    if command -v timeout >/dev/null 2>&1; then
        # GNU timeout: --preserve-status keeps pytest's own exit code,
        # --foreground helps pytest receive signals normally.
        timeout --preserve-status --foreground "${TIMEOUT_SECONDS}s" pytest -q "${nodes[@]}" >/dev/null 2>&1
        rc=$?
        # 124 = timeout's "timed out" status, 137 = killed via SIGKILL;
        # normalise both to a plain failure.
        if [ "$rc" -eq 124 ] || [ "$rc" -eq 137 ]; then
            echo "pytest timed out after ${TIMEOUT_SECONDS}s"
            rc=1
        fi
    else
        # Portable fallback: background pytest and poll once per second,
        # killing it when the deadline passes.
        pytest -q "${nodes[@]}" >/dev/null 2>&1 &
        local pid=$!
        local deadline=$((SECONDS + TIMEOUT_SECONDS))
        local timed_out=0
        while kill -0 "$pid" 2>/dev/null; do
            if [ "$SECONDS" -ge "$deadline" ]; then
                echo "pytest timed out after ${TIMEOUT_SECONDS}s; terminating (pid $pid)"
                kill -TERM "$pid" 2>/dev/null || true
                sleep 5
                kill -KILL "$pid" 2>/dev/null || true
                timed_out=1
                break
            fi
            sleep 1
        done
        if [ "$timed_out" -eq 1 ]; then
            # Reap the killed child exactly once and report failure.
            # (The original re-waited an already-reaped pid afterwards,
            # clobbering rc=1 with wait's 127.)
            wait "$pid" 2>/dev/null || true
            rc=1
        else
            # Child exited on its own: collect its real exit status.
            wait "$pid"
            rc=$?
        fi
    fi

    set -e
    return "$rc"
}

low=0
high=$N

# Binary search over prefixes of OTHER_ARR.  Window invariant (assuming the
# full set reproduces the failure): a prefix of length $low passes when run
# before test_lock.py, a prefix of length $high fails.  Halve until the
# window is one test wide.
while (( high - low > 1 )); do
    mid=$(( (low + high) / 2 ))
    prefix=("${OTHER_ARR[@]:0:mid}")
    echo "Testing first $mid of $N tests..."
    if run_with_lock "${prefix[@]}"; then
        echo "PASS -> culprit is in tests [$mid..$((N - 1))]"
        low=$mid
    else
        echo "FAIL -> culprit is in first $mid tests"
        high=$mid
    fi
done

# Report suspects: a small slice of the ordering around the bisection
# boundary.  The search converged on OTHER_ARR[low] as the most likely
# trigger; include two neighbours on each side as a safety margin.
start=$((low - 2)); [ "$start" -lt 0 ] && start=0
end=$((low + 3)); [ "$end" -gt "$N" ] && end=$N

echo
echo "Suspect tests (run these before test_lock.py to reproduce):"
# Arithmetic for-loop instead of spawning `seq`.
for ((i = start; i < end; i++)); do
    printf ' %s\n' "${OTHER_ARR[$i]}"
done

echo
echo "Re-run one of the suspect sets manually, e.g.:"
echo " pytest -q ${OTHER_ARR[$start]} ${LOCK_ARR[*]}"
rm -rf "$TMPDIR"
82 changes: 82 additions & 0 deletions doajtest/unit/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# python
import os
import sys
import threading
import subprocess
import faulthandler
import time

# Name fragment of the test that fails when run in the full suite
_TARGET = "test_lock.py"

def _dump_state(label):
pid = os.getpid()
print(f"\n=== STATE DUMP: {label} (pid={pid}) ===", file=sys.stderr)
# Python threads
print("Threads:", file=sys.stderr)
for t in threading.enumerate():
print(f" {t.name!s} alive={t.is_alive()} daemon={t.daemon}", file=sys.stderr)

# Python-level stacks
print("\nPython thread stacks (faulthandler):", file=sys.stderr)
faulthandler.dump_traceback(file=sys.stderr, all_threads=True)

# child processes whose parent is this pid (may be empty if detached)
print("\nChild processes (ps --ppid):", file=sys.stderr)
try:
subprocess.run(["ps", "-o", "pid,ppid,cmd", "--ppid", str(pid)],
check=False, text=True, stdout=sys.stderr)
except Exception as e:
print(" failed to run ps:", e, file=sys.stderr)

# open file descriptors for this process (Linux /proc)
try:
fds = os.listdir(f"/proc/{pid}/fd")
print(f"\nOpen fds: {len(fds)}", file=sys.stderr)
# print a small sample
for fd in sorted(fds)[:50]:
try:
target = os.readlink(f"/proc/{pid}/fd/{fd}")
except Exception:
target = "<unreadable>"
print(f" fd {fd}: {target}", file=sys.stderr)
except Exception as e:
print(" cannot list /proc fds:", e, file=sys.stderr)

# optional: lsof for full listing if available (may be noisy)
try:
subprocess.run(["lsof", "-p", str(pid)], check=False, text=True, stdout=sys.stderr, stderr=subprocess.DEVNULL)
except Exception:
pass

print("=== END STATE DUMP ===\n", file=sys.stderr)


def pytest_runtest_setup(item):
    """Dump process state just before the test under investigation runs."""
    # Keep the log readable: only the target test gets a detailed dump.
    if _TARGET not in item.nodeid:
        return
    _dump_state(f"before {item.nodeid}")


def pytest_runtest_teardown(item, nextitem):
    """Dump process state just after the test under investigation finishes."""
    if _TARGET not in item.nodeid:
        return
    # short pause so any background cleanup can run, then snapshot state
    time.sleep(0.1)
    _dump_state(f"after {item.nodeid}")


def pytest_sessionfinish(session, exitstatus):
    """Emit a final thread/child-process snapshot when the session ends.

    Helps diagnose the case where pytest hangs *after* sessionfinish
    because of lingering non-daemon threads or unreaped children.
    """
    err = sys.stderr
    print("\n=== pytest_sessionfinish: active threads ===", file=err)
    for worker in threading.enumerate():
        print(f" {worker.name!s} alive={worker.is_alive()} daemon={worker.daemon}", file=err)

    print("\n=== Python thread stacks (faulthandler) ===", file=err)
    faulthandler.dump_traceback(file=err, all_threads=True)

    print("\n=== child processes (ps) ===", file=err)
    try:
        subprocess.run(
            ["ps", "-o", "pid,ppid,cmd", "--ppid", str(os.getpid())],
            check=False, text=True, stdout=err)
    except Exception as exc:
        print(" failed to list children:", exc, file=err)
14 changes: 13 additions & 1 deletion doajtest/unit/test_lock.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pytest

from doajtest.helpers import DoajTestCase
from portality import models, lock
from portality.lib import dates
Expand All @@ -6,21 +8,29 @@
from datetime import timedelta
from copy import deepcopy

from portality.lib.thread_utils import wait_until

pytestmark = pytest.mark.skip(reason="skip whole module while debugging")

class TestLock(DoajTestCase):

def test_01_lock_success_fail(self):
print("Starting test_01_lock_success_fail")
source = JournalFixtureFactory.make_journal_source()
j = models.Journal(**source)
j.save(blocking=True)
print("saved journal")

# first set a lock
l = lock.lock("journal", j.id, "testuser")
assert l.about == j.id
assert l.type == "journal"
assert l.username == "testuser"
assert not l.is_expired()
print("set lock")

time.sleep(1)
wait_until(lambda: lock.has_lock("journal", j.id, "testuser") is True, timeout=5)
# time.sleep(1)

# now try and set the lock again for the same thing by the same user
l2 = lock.lock("journal", j.id, "testuser")
Expand All @@ -31,6 +41,8 @@ def test_01_lock_success_fail(self):
with self.assertRaises(lock.Locked):
l3 = lock.lock("journal", j.id, "otheruser")

print("completed test_01_lock_success_fail")

def test_02_unlock(self):
unlocked = lock.unlock("journal", "qojoiwjreiqwefoijqwiefjw", "testuser")
assert unlocked is True
Expand Down
4 changes: 3 additions & 1 deletion portality/dao.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,9 @@ def destroy_index(cls, **es_kwargs):
# else:
# return esprit.raw.delete_index(es_connection)
print('Destroying indexes with prefix ' + app.config['ELASTIC_SEARCH_DB_PREFIX'] + '*')
return ES.indices.delete(app.config['ELASTIC_SEARCH_DB_PREFIX'] + '*', **es_kwargs)
resp = ES.indices.delete(app.config['ELASTIC_SEARCH_DB_PREFIX'] + '*', **es_kwargs)
print("Destroyed indexes with prefix " + app.config['ELASTIC_SEARCH_DB_PREFIX'] + '*')
return resp

@classmethod
def check_es_raw_response(cls, res, extra_trace_info=''):
Expand Down