diff --git a/HERCULES_READER_COMMANDS.md b/HERCULES_READER_COMMANDS.md new file mode 100644 index 0000000..7022b08 --- /dev/null +++ b/HERCULES_READER_COMMANDS.md @@ -0,0 +1,131 @@ +# Hercules Card Reader Device Commands + +This document contains the Hercules console commands used to manage the card reader device (00C/3505). + +## View Device Status + +``` +devlist 00c +``` + +**Example Output:** +``` +HHC02279I 0:000C 3505 3505 sockdev ascii trunc eof IO[2] +HHC02279I (no one currently connected) +``` + +**Key Information:** +- `IO[n]` - Shows the I/O state number +- `(no one currently connected)` - Socket is free +- `(client <ip> (<hostname>) connected)` - Socket is in use + +## Reset Card Reader (When Stuck) + +### Method 1: Detach and Reattach (Recommended) + +``` +detach 00c +attach 00c 3505 3505 sockdev ascii trunc eof +``` + +This completely removes and recreates the device, clearing any stuck connections. + +### Method 2: Devinit (May Not Work if Socket is Wedged) + +``` +devinit 00c 3505 3505 sockdev ascii trunc eof +``` + +**Note:** `devinit` may fail with "Address already in use" if the socket is stuck. Use Method 1 instead. + +## Common Issues + +### Device Stuck in "IO[n] open" State + +**Symptoms:** +- `devlist 00c` shows `IO[5] open` or similar +- Client shown as connected even after jobs complete +- New connections rejected with "device busy or interrupt pending" + +**Solution:** +``` +detach 00c +attach 00c 3505 3505 sockdev ascii trunc eof +``` + +### Connection Refused from Workflow + +**Symptoms:** +- Workflow logs show: `Connection refused` from netcat +- Device shows: `localhost:3505` in devlist + +**Problem:** Device bound to localhost only, not accepting external connections. 
+ +**Solution:** +Use port number only (binds to all interfaces): +``` +detach 00c +attach 00c 3505 3505 sockdev ascii trunc eof +``` + +**DO NOT USE:** +``` +attach 00c 3505 localhost:3505 sockdev ascii trunc eof # Wrong - localhost only +attach 00c 3505 0.0.0.0:3505 sockdev ascii trunc eof # Also fails on some systems +``` + +## Device Configuration for Workflow + +For the Gitea workflow to connect successfully, the card reader must be configured to accept connections from any interface: + +``` +attach 00c 3505 3505 sockdev ascii trunc eof +``` + +Verify with: +``` +devlist 00c +``` + +Should show: +``` +HHC02279I 0:000C 3505 3505 sockdev ascii trunc eof IO[n] +``` + +**Not:** +``` +HHC02279I 0:000C 3505 localhost:3505 sockdev ascii trunc eof IO[n] # Wrong +``` + +## Troubleshooting Workflow Socket Issues + +If workflows are failing with socket errors: + +1. **Check device status:** + ``` + devlist 00c + ``` + +2. **Look for stuck connections:** + - `IO[5] open` with client connected = stuck + +3. **Reset the device:** + ``` + detach 00c + attach 00c 3505 3505 sockdev ascii trunc eof + ``` + +4. **Verify it's ready:** + ``` + devlist 00c + ``` + Should show `(no one currently connected)` + +## Prevention + +The `submit_job.py` script now includes a 10-second delay between submissions to prevent socket conflicts: +- Upload job submits +- Wait 10 seconds for JES to process +- Compile job submits + +This gives Hercules time to fully close the socket between connections. 
diff --git a/scripts/submit_job.py b/scripts/submit_job.py index 9152024..66537c7 100755 --- a/scripts/submit_job.py +++ b/scripts/submit_job.py @@ -18,31 +18,26 @@ MVSHOST = "oldcomputernerd.com" RDRPORT = 3505 MVS_PASSWORD = os.environ.get("MVS_BATCH_PASSWORD") -def wait_for_port(host, port, timeout=30, poll_interval=1): - """Wait for port to become available (not in use)""" - print(f"Waiting for port {port} on {host} to be ready...") - start_time = time.time() +def wait_for_reader(host, port, wait_seconds=10): + """ + Wait for card reader to finish processing previous job. - while time.time() - start_time < timeout: - try: - # Try to connect - if successful, port is available - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(2) - result = sock.connect_ex((host, port)) - sock.close() + Hercules keeps the socket in IO[n] open state while JES processes + the submitted job. We need to wait long enough for: + 1. JES to read the job from the internal reader + 2. Hercules to close the socket completely + 3. The port to be ready for a new connection - if result == 0: - print(f"Port {port} is ready") - return True - else: - print(f"Port {port} not responding (error {result}), waiting...") - time.sleep(poll_interval) - except socket.error as e: - print(f"Socket error: {e}, retrying...") - time.sleep(poll_interval) - - print(f"Timeout waiting for port {port} to be ready after {timeout}s") - return False + A simple fixed delay is more reliable than trying to probe the port, + since the port will respond even when Hercules will reject connections. 
+ """ + print(f"Waiting {wait_seconds} seconds for card reader to finish processing...") + for i in range(wait_seconds): + time.sleep(1) + if (i + 1) % 3 == 0: + print(f" {wait_seconds - i - 1} seconds remaining...") + print("Card reader should be ready now") + return True def create_jcl_payload(local_file, dataset_name, member_name): @@ -206,10 +201,9 @@ if __name__ == "__main__": if upload_source(local_file, dataset_name, member_name, mvshost) != 0: sys.exit(1) - # Wait for card reader port to be ready before submitting compile job - if not wait_for_port(mvshost, RDRPORT, timeout=30): - print(f"Error: Card reader port {RDRPORT} not available") - sys.exit(1) + # Wait for card reader to finish processing upload job before submitting compile job + # This prevents "device busy or interrupt pending" errors from Hercules + wait_for_reader(mvshost, RDRPORT, wait_seconds=10) # Step 2: Submit JCL job exit_code = submit_jcl(job, mvshost)