5 changed files with 6 additions and 514 deletions
--- a/piker/storage/cli.py
+++ b/piker/storage/cli.py
@ -441,35 +441,9 @@ def ldshm(
                    wdts,
                    deduped,
                    diff,
-                    valid_races,
+                ) = tsp.dedupe(
                    dq_issues,
                ) = tsp.dedupe_ohlcv_smart(
                    shm_df,
-                )
+                    period=period_s,
                # Report duplicate analysis
                if diff > 0:
                    log.info(
                        f'Removed {diff} duplicate timestamp(s)\n'
                    )
                    if valid_races is not None:
                        identical: int = (
                            valid_races
                            .filter(pl.col('identical_bars'))
                            .height
                        )
                        monotonic: int = valid_races.height - identical
                        log.info(
                            f'Valid race conditions: {valid_races.height}\n'
                            f'  - Identical bars: {identical}\n'
                            f'  - Volume monotonic: {monotonic}\n'
                        )
                    if dq_issues is not None:
                        log.warning(
                            f'DATA QUALITY ISSUES from provider: '
                            f'{dq_issues.height} timestamp(s)\n'
                            f'{dq_issues}\n'
                )
                # detect gaps from in expected (uniform OHLC) sample period
@ -486,8 +460,7 @@ def ldshm(
                    # TODO: actually pull the exact duration
                    # expected for each venue operational period?
-                    # gap_dt_unit='day',
+                    gap_dt_unit='days',
                    gap_dt_unit='day',
                    gap_thresh=1,
                )
@ -561,13 +534,8 @@ def ldshm(
                        tf2aids[period_s] = aids
                else:
-                    # No significant gaps to handle, but may have had
+                    # allow interaction even when no ts problems.
-                    # duplicates removed (valid race conditions are ok)
+                    assert not diff
                    if diff > 0 and dq_issues is not None:
                        log.warning(
                            'Found duplicates with data quality issues '
                            'but no significant time gaps!\n'
                        )
            await tractor.pause()
            log.info('Exiting TSP shm anal-izer!')
--- a/piker/tsp/init.py
+++ b/piker/tsp/init.py
@ -40,9 +40,6 @@ from ._anal import (
    # `numpy` only
    slice_from_time as slice_from_time,
 )
 from ._dedupe_smart import (
    dedupe_ohlcv_smart as dedupe_ohlcv_smart,
 )
 from ._history import (
    iter_dfs_from_shms as iter_dfs_from_shms,
    manage_history as manage_history,
--- a/piker/tsp/_anal.py
+++ b/piker/tsp/_anal.py
@ -578,22 +578,11 @@ def detect_time_gaps(
    # NOTE: this flag is to indicate that on this (sampling) time
    # scale we expect to only be filtering against larger venue
    # closures-scale time gaps.
    #
    # Map to total_ method since `dt_diff` is a duration type,
    # not datetime - modern polars requires `total_*` methods
    # for duration types (e.g. `total_days()` not `day()`)
    # Ensure plural form for polars API (e.g. 'day' -> 'days')
    unit_plural: str = (
        gap_dt_unit
        if gap_dt_unit.endswith('s')
        else f'{gap_dt_unit}s'
    )
    duration_method: str = f'total_{unit_plural}'
    return step_gaps.filter(
        # Second by an arbitrary dt-unit step size
        getattr(
            pl.col('dt_diff').dt,
-            duration_method,
+            gap_dt_unit,
        )().abs() > gap_thresh
    )
--- a/piker/tsp/_dedupe_smart.py
+++ b/piker/tsp/_dedupe_smart.py
@ -1,206 +0,0 @@
 '''
 Smart OHLCV deduplication with data quality validation.
 Handles concurrent write conflicts by keeping the most complete bar
 (highest volume) while detecting data quality anomalies.
 '''
 import polars as pl
 from ._anal import with_dts
 def dedupe_ohlcv_smart(
    src_df: pl.DataFrame,
    time_col: str = 'time',
    volume_col: str = 'volume',
    sort: bool = True,
 ) -> tuple[
    pl.DataFrame,  # with dts
    pl.DataFrame,  # deduped (keeping higher volume bars)
    int,  # count of dupes removed
    pl.DataFrame|None,  # valid race conditions
    pl.DataFrame|None,  # data quality violations
 ]:
    '''
    Smart OHLCV deduplication keeping the most complete bars.

    For duplicate timestamps, keep the bar with the highest volume
    under the assumption that higher volume indicates more
    complete/final data from backfill vs. partial live updates.

    Parameters
    ----------
    - src_df: frame of OHLCV bars; must carry `time_col`,
      `volume_col` and the 'index', 'open', 'high', 'low' and
      'close' columns.
    - time_col: name of the timestamp column used to detect dupes.
    - volume_col: name of the volume column used both for the
      duplicate analysis and for picking the bar to keep.
    - sort: when set, the deduped frame is re-sorted by `time_col`.

    Returns
    -------
    Tuple of:
    - wdts: original dataframe with datetime columns added
    - deduped: deduplicated frame keeping highest-volume bars
    - diff: number of duplicate rows removed
    - valid_races: duplicate groups meeting the expected race
      condition pattern (volume/high non-decreasing, low
      non-increasing, open fixed), or `None` when there are none
    - data_quality_issues: duplicate groups violating the expected
      relationships indicating provider data problems, or `None`
      when there are none

    '''
    wdts: pl.DataFrame = with_dts(src_df)

    # Find duplicate timestamps
    dupes: pl.DataFrame = wdts.filter(
        pl.col(time_col).is_duplicated()
    )
    if dupes.is_empty():
        # No duplicates, return as-is
        return (wdts, wdts, 0, None, None)

    # Collect every duplicate group (one row per timestamp) with
    # its per-bar values listed in 'index' order.
    dupe_analysis: pl.DataFrame = (
        dupes
        .sort([time_col, 'index'])
        .group_by(time_col, maintain_order=True)
        .agg([
            pl.col('index').alias('indices'),
            # honour the caller's volume column instead of
            # hard-coding 'volume' so the analysis looks at the
            # same column the final dedupe step sorts on.
            pl.col(volume_col).alias('volumes'),
            pl.col('high').alias('highs'),
            pl.col('low').alias('lows'),
            pl.col('open').alias('opens'),
            pl.col('close').alias('closes'),
            pl.col('dt').first().alias('dt'),
            pl.len().alias('count'),
        ])
    )

    def check_ohlcv_validity(row) -> dict[str, bool]:
        '''
        Check if duplicate bars follow the expected race condition
        pattern.

        For a valid live-update -> backfill race:
        - volume should be monotonically non-decreasing
        - high should be monotonically non-decreasing
        - low should be monotonically non-increasing
        - open should be identical (fixed at bar start)

        Returns dict of violation flags.

        '''
        vols: list = row['volumes']
        highs: list = row['highs']
        lows: list = row['lows']
        opens: list = row['opens']

        violations: dict[str, bool] = {
            'volume_non_monotonic': False,
            'high_decreased': False,
            'low_increased': False,
            'open_mismatch': False,
            'identical_bars': False,
        }
        opens_differ: bool = len(set(opens)) > 1

        # Pure duplicate: every compared field is the same across
        # all bars of the group.
        # NOTE(review): 'closes' is aggregated above but not
        # compared here - confirm whether that is intended.
        if (
            len(set(vols)) == 1
            and len(set(highs)) == 1
            and len(set(lows)) == 1
            and not opens_differ
        ):
            violations['identical_bars'] = True
            return violations

        # volume may only grow or stay the same
        violations['volume_non_monotonic'] = any(
            nxt < prev
            for prev, nxt in zip(vols, vols[1:])
        )
        # high may only grow or stay the same
        violations['high_decreased'] = any(
            nxt < prev
            for prev, nxt in zip(highs, highs[1:])
        )
        # low may only shrink or stay the same
        violations['low_increased'] = any(
            nxt > prev
            for prev, nxt in zip(lows, lows[1:])
        )
        # open is fixed at bar start so must never change
        violations['open_mismatch'] = opens_differ
        return violations

    # Apply validation per duplicate group and flatten the
    # resulting flags into plain boolean columns.
    dupe_analysis = dupe_analysis.with_columns([
        pl.struct(['volumes', 'highs', 'lows', 'opens'])
        .map_elements(
            check_ohlcv_validity,
            return_dtype=pl.Struct([
                pl.Field('volume_non_monotonic', pl.Boolean),
                pl.Field('high_decreased', pl.Boolean),
                pl.Field('low_increased', pl.Boolean),
                pl.Field('open_mismatch', pl.Boolean),
                pl.Field('identical_bars', pl.Boolean),
            ])
        )
        .alias('validity')
    ]).unnest('validity')

    any_violation: pl.Expr = (
        pl.col('volume_non_monotonic')
        | pl.col('high_decreased')
        | pl.col('low_increased')
        | pl.col('open_mismatch')
    )

    # Valid if no violations (which includes identical bars)
    valid_races: pl.DataFrame|None = dupe_analysis.filter(
        ~any_violation
    )
    if valid_races.is_empty():
        valid_races = None

    # Issues if any non-identical violation exists
    data_quality_issues: pl.DataFrame|None = dupe_analysis.filter(
        any_violation
        & ~pl.col('identical_bars')
    )
    if data_quality_issues.is_empty():
        data_quality_issues = None

    # Deduplicate: sorting by volume within each timestamp and
    # keeping the last row retains the highest volume bar.
    deduped: pl.DataFrame = (
        wdts
        .sort([time_col, volume_col])
        .unique(
            subset=[time_col],
            keep='last',
            maintain_order=False,
        )
    )
    # `unique(maintain_order=False)` gives no ordering guarantee
    # so restore time order when requested.
    if sort:
        deduped = deduped.sort(by=time_col)

    diff: int = wdts.height - deduped.height
    return (
        wdts,
        deduped,
        diff,
        valid_races,
        data_quality_issues,
    )
--- a/snippets/claude_debug_helper.py
+++ b/snippets/claude_debug_helper.py
@ -1,256 +0,0 @@
 #!/usr/bin/env python
 '''
 Programmatic debugging helper for `pdbp` REPL human-like
 interaction but built to allow `claude` to interact with 
 crashes and `tractor.pause()` breakpoints alongside a human dev.
 Originally written by `clauded` during a backfiller inspection
 session with @goodboy trying to resolve duplicate/gappy ohlcv ts
 issues discovered while testing the new `nativedb` tsdb.
 Allows `claude` to run `pdb` commands and capture output in an "offline"
 manner but generating similar output as if it was interacting with
 the debug REPL.
 The use of `pexpect` is heavily based on tractor's REPL UX test
 suite(s), namely various `tests/devx/test_debugger.py` patterns.
 '''
 import sys
 import os
 import time
 import pexpect
 from pexpect.exceptions import (
    TIMEOUT,
    EOF,
 )
 PROMPT: str = r'\(Pdb\+\)'
 def expect(
    child: pexpect.spawn,
    patt: str,
    **kwargs,
 ) -> None:
    '''
    Wait for `patt` to show up in `child`'s output, forwarding any
    extra `kwargs` straight to `child.expect()`.

    On a `TIMEOUT` the last console data seen is printed before
    the error is re-raised to the caller.

    '''
    try:
        child.expect(patt, **kwargs)

    except TIMEOUT:
        # the buffered output may be `bytes` or already text
        raw = child.before
        if isinstance(raw, bytes):
            last_seen: str = str(raw.decode())
        else:
            last_seen: str = str(raw)

        print(
            f'TIMEOUT waiting for pattern: {patt}\n'
            f'Last seen output:\n{last_seen}'
        )
        raise
 def run_pdb_commands(
    commands: list[str],
    initial_cmd: str = 'piker store ldshm xmrusdt.usdtm.perp.binance',
    timeout: int = 30,
    print_output: bool = True,
 ) -> dict[str, str]:
    '''
    Launch `initial_cmd`, wait for the debugger prompt and then
    feed it each entry of `commands` in order.

    Returns a table of command -> captured output; when a timeout
    hits, its message is stored under the '_error' key.

    '''
    def as_text(buf) -> str:
        # captured output may be `bytes` or already text
        if isinstance(buf, bytes):
            return str(buf.decode())
        return str(buf)

    # Colorless output is much simpler to parse
    os.environ['PYTHON_COLORS'] = '0'

    if print_output:
        print(f'Spawning: {initial_cmd}')

    proc: pexpect.spawn = pexpect.spawn(
        initial_cmd,
        timeout=timeout,
        encoding='utf-8',
        echo=False,
    )
    captured: dict[str, str] = {}
    try:
        # block until the REPL is up
        expect(proc, PROMPT, timeout=timeout)
        if print_output:
            print('Reached pdb prompt!')

        for pdb_cmd in commands:
            if print_output:
                print(f'\n>>> {pdb_cmd}')

            proc.sendline(pdb_cmd)
            time.sleep(0.1)

            # everything emitted before the next prompt is the
            # output of the command just sent
            expect(proc, PROMPT, timeout=timeout)
            text: str = as_text(proc.before)
            captured[pdb_cmd] = text
            if print_output:
                print(text)

        # ask the debugger to exit; not seeing a clean EOF in
        # time is tolerated
        proc.sendline('quit')
        try:
            proc.expect(EOF, timeout=5)
        except (TIMEOUT, EOF):
            pass

    except TIMEOUT as err:
        print(f'Timeout: {err}')
        if proc.before:
            leftover: str = as_text(proc.before)
            print(f'Buffer:\n{leftover}')

        captured['_error'] = str(err)

    finally:
        # never leave the spawned process behind
        if proc.isalive():
            proc.close(force=True)

    return captured
 class InteractivePdbSession:
    '''
    Manager for a long-lived pdb session which can be driven one
    command at a time; usable as a context manager.

    '''
    def __init__(
        self,
        cmd: str = 'piker store ldshm xmrusdt.usdtm.perp.binance',
        timeout: int = 30,
    ):
        # shell command to spawn and the per-wait timeout (secs)
        self.cmd: str = cmd
        self.timeout: int = timeout
        # populated by `.start()`
        self.child: pexpect.spawn|None = None
        # ordered (command, output) pairs of everything run so far
        self.history: list[tuple[str, str]] = []

    def start(self) -> None:
        '''
        Spawn the configured command and block until the first
        debugger prompt is seen.

        '''
        os.environ['PYTHON_COLORS'] = '0'
        print(f'Starting: {self.cmd}')

        spawn_kwargs: dict = dict(
            timeout=self.timeout,
            encoding='utf-8',
            echo=False,
        )
        self.child = pexpect.spawn(self.cmd, **spawn_kwargs)

        # first prompt means the REPL is ready for input
        expect(self.child, PROMPT, timeout=self.timeout)
        print('Ready at pdb prompt!')

    def run(
        self,
        cmd: str,
        print_output: bool = True,
    ) -> str:
        '''
        Send one pdb command, wait for the next prompt and hand
        back whatever was printed in between.

        Raises `RuntimeError` when there is no live session.

        '''
        if not (self.child and self.child.isalive()):
            raise RuntimeError('Session not started or dead')

        if print_output:
            print(f'\n>>> {cmd}')

        self.child.sendline(cmd)
        time.sleep(0.1)

        # output of the command is whatever precedes the prompt
        expect(self.child, PROMPT, timeout=self.timeout)

        raw = self.child.before
        if isinstance(raw, bytes):
            text: str = str(raw.decode())
        else:
            text: str = str(raw)

        self.history.append((cmd, text))
        if print_output:
            print(text)

        return text

    def quit(self) -> None:
        '''
        Leave the debugger and tear down the child process; a
        no-op when there is no live session.

        '''
        proc = self.child
        if not proc or not proc.isalive():
            return

        proc.sendline('quit')
        try:
            proc.expect(EOF, timeout=5)
        except (TIMEOUT, EOF):
            # not seeing a clean EOF in time is tolerated since
            # the child is force-closed right after
            pass

        proc.close(force=True)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.quit()
 if __name__ == '__main__':
    # Stock inspection commands, used unless the CLI provides
    # its own set as positional args.
    inspect_cmds: list[str] = sys.argv[1:] or [
        'locals().keys()',
        'type(deduped)',
        'deduped.shape',
        'step_gaps.shape if "step_gaps" in locals() else "N/A"',
        'venue_gaps.shape if "venue_gaps" in locals() else "N/A"',
    ]

    # Example of an interactive session: run every command in
    # order inside a single debugger instance.
    with InteractivePdbSession() as session:
        for pdb_cmd in inspect_cmds:
            session.run(pdb_cmd)

    print('\n=== Session Complete ===')