cocrawler · malteos · Aug 15, 2025 · Aug 19, 2025 · Aug 20, 2025 · Aug 22, 2025
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -8,26 +8,39 @@ on:
     branches:
     - main
 
+# These permissions are needed to interact with AWS S3 via GitHub's OIDC Token endpoint
+permissions:
+  id-token: write
+  contents: read
+  pull-requests: read
+
 jobs:
   unit-tests:
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: true
       matrix:
+        # The full test-suite is only run with os=ubuntu and py=3.12
         python-version: [
-          '3.8', '3.9', '3.10', '3.11', '3.12', '3.13'
+          '3.8', 
+          '3.9', 
+          '3.10', 
+          '3.11', 
+          '3.12', 
+          '3.13'
         ]
         os: [ubuntu-latest]
         EXTRA: [false]  # used to force includes to get included
         include:
           - python-version: '3.8'
             os: ubuntu-22.04  # oldest version on github actions
             EXTRA: true
-          - python-version: '3.13'
-            os: ubuntu-latest
-            env:
-              LOGLEVEL=DEBUG
-            EXTRA: true
+          # disabled (duplicated matrix entry)
+          # - python-version: '3.13'
+          #   os: ubuntu-latest
+          #   env:
+          #     LOGLEVEL=DEBUG
+          #   EXTRA: true
           - python-version: '3.13'
             os: macos-latest
             EXTRA: true
@@ -57,6 +70,20 @@ jobs:
       - name: Install cdx_toolkit
         run: pip install .[test]
 
+      - name: Configure AWS credentials from OIDC (disabled for forks)
+        if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'        
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::837454214164:role/GitHubActions-Role
+          aws-region: us-east-1
+
+      - name: Disable S3 unit tests for Python 3.8 (boto3 requires Python 3.9+)
+        if: ${{ startsWith(matrix.python-version, '3.8') }}
+        uses: actions/github-script@v7
+        with:
+          script: |
+            core.exportVariable('CDXT_DISABLE_S3_TESTS', '1')
+
       - name: Run tests
         run: |
           make test_coverage

diff --git a/README.md b/README.md
@@ -256,6 +256,112 @@ get the most recent N captures: --limit and limit= will return the
 oldest N captures. With the 'mixed' ordering, a large enough limit=
 will get close to returning the most recent N captures.
 
+## Filtering CDX files
+
+The `cdxt` command-line tool can filter CDX files against a
+whitelist of URLs or SURTs. The filter keeps every CDX entry
+that matches at least one entry in the whitelist;
+all other CDX entries are discarded.
+
+For matching, all URLs are converted into SURTs. A match occurs
+when a given SURT from the CDX file starts with one of the SURT
+prefixes defined in the whitelist.
+
+The CDX filter can read and write files from local and remote file 
+systems, like S3 buckets. Multiple input files can be defined
+using a glob pattern.
+
+```
+$ cdxt filter_cdx <input_cdx_path> <whitelist_path> <output_path> \
+    --filter-type <url or surt> \
+    [--input-glob <glob pattern like "*_cdx-*.gz">]
+```
+
+For example, you can filter CDX from Common Crawl as follows:
+
+```
+$ cdxt -v filter_cdx \
+    s3://commoncrawl/cc-index/collections \
+    /local/path/to/my-url-whitelist.txt \
+    s3://my-s3-bucket/filtered-cdxs --filter-type url \
+    --input-glob "/CC-MAIN-2024-30/indexes/*.gz" --overwrite
+```
+
+The whitelist file looks like this (one entry per line):
+
+```
+example.com
+github.com/cococrawler
+```
+
+Filtering throughput depends on your machine. For reference,
+on an AWS EC2 c5n.xlarge instance filtering all 300 CDX files 
+from CC-MAIN-2024-30 takes ~1.4 hours with 100k URLs in the whitelist. 
+
+## WARC extraction using CDX files
+
+You can extract parts of WARC files using the cdxt command line script.
+The WARC extraction can read CDX files from local and remote file 
+systems, like S3 buckets. Multiple CDX files can be defined
+using a glob pattern. For downloading WARC parts from HTTP or S3, you can 
+define the download prefix, e.g., `s3://commoncrawl` for S3 download.
+
+```
+$ cdxt -v --cc  warc_by_cdx \
+    <path_to_cdx> [--cdx-glob <glob pattern, e.g., "*.gz">] \
+    --prefix <output prefix> \
+    --warc-download-prefix=<warc download prefix, e.g., s3://commoncrawl> \
+    --creator <name and contact of creator> \
+    --operator <name and contact of operator> \
+    [--implementation <fsspec or aioboto3, defaults to fsspec>] \
+    [--write-paths-as-resource-records <one or more paths for resource records>] \
+    [--write-paths-as-resource-records-metadata <one or more paths for metadata of resource records>]
+```
+
+By default, we use an [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html)
+implementation to read from and write to local or remote file systems.
+For better S3 read/write throughput, we also provide a dedicated implementation
+using [aioboto3](https://github.com/terricain/aioboto3) that you can enable with
+the `--implementation=aioboto3` argument. With aioboto3, we achieved ~80 requests per second
+on an AWS EC2 c5n.xlarge instance.
+
+You can add one or multiple files with metadata as resource records to
+the extracted WARC. For instance, this is useful for preserving the CDX filter
+inputs, e.g., the whitelist. To do this, you need to provide the
+corresponding file paths as arguments `--write-paths-as-resource-records=s3://my-s3-bucket/path/to/my-url-whitelist.txt`
+and `--write-paths-as-resource-records-metadata=s3://my-s3-bucket/path/to/metadata.json`.
+The metadata file is optional and can have the following optional fields:
+
+```json
+{
+    "warc_content_type": "str",
+    "uri": "str",
+    "http_headers": {"k": "v"},
+    "warc_headers_dict": {"k": "v"}
+}
+```
+
+This is an example of a metadata JSON file:
+
+```json
+{
+    "uri": "filter_cdx.gz",
+    "warc_content_type": "application/cdx"
+}
+```
+
+The full WARC extraction command could look like this:
+
+```
+$ cdxt -v --cc  warc_by_cdx \
+    s3://my-s3-bucket/filtered-cdxs --cdx-glob "*.gz" \
+    --prefix /local/path/filtered-warcs/ \
+    --warc-download-prefix=s3://commoncrawl \
+    --creator foo --operator bob \
+    --write-paths-as-resource-records=s3://my-s3-bucket/path/to/my-url-whitelist.txt \
+    --write-paths-as-resource-records-metadata=s3://my-s3-bucket/path/to/metadata.json
+```
+
 ## TODO
 
 Content downloading needs help with charset issues, preferably

diff --git a/cdx_toolkit/cli.py b/cdx_toolkit/cli.py
@@ -6,7 +6,15 @@
 import os
 
 import cdx_toolkit
-from cdx_toolkit.commoncrawl import normalize_crawl
+
+from cdx_toolkit.utils import get_version, setup
+
+from cdx_toolkit.filter_cdx import run_filter_cdx
+from cdx_toolkit.filter_cdx.args import add_filter_cdx_args
+
+from cdx_toolkit.filter_warc import run_warcer_by_cdx
+from cdx_toolkit.filter_warc.args import add_warcer_by_cdx_args
+
 
 LOGGER = logging.getLogger(__name__)
 
@@ -54,6 +62,14 @@ def main(args=None):
     warc.add_argument('url')
     warc.set_defaults(func=warcer)
 
+    warc_by_cdx = subparsers.add_parser('warc_by_cdx', help='iterate over capture content based on an CDX index file, creating a warc')
+    add_warcer_by_cdx_args(warc_by_cdx)
+    warc_by_cdx.set_defaults(func=run_warcer_by_cdx)
+
+    filter_cdx = subparsers.add_parser('filter_cdx', help='Filter CDX files based on SURT prefixes whitelist')
+    add_filter_cdx_args(filter_cdx)
+    filter_cdx.set_defaults(func=run_filter_cdx)
+
     size = subparsers.add_parser('size', help='imprecise count of how many results are available')
     size.add_argument('--details', action='store_true', help='show details of each subindex')
     size.add_argument('url')
@@ -89,48 +105,6 @@ def set_loglevel(cmd):
     LOGGER.info('set loglevel to %s', str(loglevel))
 
 
-def get_version():
-    return cdx_toolkit.__version__
-
-
-def setup(cmd):
-    kwargs = {}
-    kwargs['source'] = 'cc' if cmd.crawl else cmd.cc or cmd.ia or cmd.source or None
-    if kwargs['source'] is None:
-        raise ValueError('must specify --cc, --ia, or a --source')
-    if cmd.wb:
-        kwargs['wb'] = cmd.wb
-    if cmd.cc_mirror:
-        kwargs['cc_mirror'] = cmd.cc_mirror
-    if cmd.crawl:
-        kwargs['crawl'] = normalize_crawl([cmd.crawl])  # currently a string, not a list
-    if getattr(cmd, 'warc_download_prefix', None) is not None:
-        kwargs['warc_download_prefix'] = cmd.warc_download_prefix
-
-    cdx = cdx_toolkit.CDXFetcher(**kwargs)
-
-    kwargs = {}
-    if cmd.limit:
-        kwargs['limit'] = cmd.limit
-    if 'from' in vars(cmd) and vars(cmd)['from']:  # python, uh, from is a reserved word
-        kwargs['from_ts'] = vars(cmd)['from']
-    if cmd.to:
-        kwargs['to'] = cmd.to
-    if cmd.closest:
-        if not cmd.get:  # pragma: no cover
-            LOGGER.info('note: --closest works best with --get')
-        kwargs['closest'] = cmd.closest
-    if cmd.filter:
-        kwargs['filter'] = cmd.filter
-
-    if cmd.cmd == 'warc' and cmd.size:
-        kwargs['size'] = cmd.size
-
-    if cmd.cmd == 'size' and cmd.details:
-        kwargs['details'] = cmd.details
-
-    return cdx, kwargs
-
 
 def winnow_fields(cmd, fields, obj):
     if cmd.all_fields:
@@ -213,9 +187,15 @@ def warcer(cmd, cmdline):
             LOGGER.warning('revisit record being resolved for url %s %s', url, timestamp)
         writer.write_record(record)
 
+    writer.close()
+
 
 def sizer(cmd, cmdline):
     cdx, kwargs = setup(cmd)
 
     size = cdx.get_size_estimate(cmd.url, **kwargs)
     print(size)
+
+
+if __name__ == "__main__":
+    main()