API Reference

`scrape_met_paintings(params, limit=200, output='data/dbs/art_db.json')`

Scrape artwork metadata and images from the Met Museum public API.

Searches the Met collection using the given query parameters, then for each result fetches object metadata and downloads the primary image if it is public domain. Images are saved to data/images/<id>.jpg.

Parameters:

Name	Type	Description	Default
`params`	`dict`	Query parameters forwarded to the Met search endpoint (e.g. `isHighlight`, `departmentId`, `q`).	required
`limit`	`int`	Maximum number of object IDs to process. Defaults to 200.	`200`
`output`	`str`	Path to the output JSON file. Defaults to `data/dbs/art_db.json`.	`'data/dbs/art_db.json'`

Source code in the_met_art_dataset/scraper.py

def scrape_met_paintings(
    params: dict,
    limit: int = 200,
    output: str = "data/dbs/art_db.json"
    ):
    """Scrape artwork metadata and images from the Met Museum public API.

    Searches the Met collection using the given query parameters, then for each
    result fetches object metadata and downloads the primary image if it is
    public domain. Images are saved to ``data/images/<id>.jpg``.

    Objects are skipped (with a message printed) when the metadata request
    fails, the metadata is not valid JSON, the object has no public-domain
    primary image, or the image itself cannot be downloaded. Only objects
    whose image was written to disk are recorded in the output file.

    Args:
        params: Query parameters forwarded to the Met search endpoint
            (e.g. ``isHighlight``, ``departmentId``, ``q``).
        limit: Maximum number of object IDs to process. Defaults to 200.
        output: Path to the output JSON file. Defaults to
            ``data/dbs/art_db.json``. Missing parent directories are created.
    """
    base_url = "https://collectionapi.metmuseum.org/public/collection/v1"
    # Without a timeout a single stalled connection would hang the whole run.
    request_timeout = 30  # seconds

    # 1. Search the collection with the caller-supplied query parameters
    response = requests.get(f"{base_url}/search", params=params, timeout=request_timeout)
    print(f"Full URL sent by Python: {response.url}")
    # ``objectIDs`` can be present but null (e.g. a search with no matches);
    # ``.get(key, [])`` would pass that None through and the slice would fail.
    ids = (response.json().get('objectIDs') or [])[:limit]
    db = []
    os.makedirs("data/images", exist_ok=True)

    for oid in ids:
        # 2. Get Object Data
        try:
            response = requests.get(f"{base_url}/objects/{oid}", timeout=request_timeout)
        except requests.exceptions.RequestException as exc:
            # One network hiccup should not discard everything scraped so far.
            print(f"❌ Failed to fetch ID {oid}: {exc}. Skipping.")
            continue

        if response.status_code != 200:
            print(f"❌ Failed to fetch ID {oid}: Status {response.status_code}. Skipping.")
            # If you get a 429 (Too Many Requests), wait a bit longer
            if response.status_code == 429:
                print("         🚦 Rate limit hit. Sleeping for 5 seconds...")
                time.sleep(5)
            elif response.status_code == 403:
                print("        🔒 Access forbidden. Skipping.")
            continue

        try:
            obj = response.json()
        except requests.exceptions.JSONDecodeError:
            print(f"⚠️ Error decoding JSON for ID {oid}. Skipping.")
            continue

        # 3. Check for image and Public Domain
        img_url = obj.get('primaryImage')
        if not (img_url and obj.get('isPublicDomain')):
            print(f"Photo {oid} not of public domain")
            continue

        file_name = f"data/images/{oid}.jpg"

        # 4. Download Image
        try:
            img_response = requests.get(img_url, timeout=request_timeout)
        except requests.exceptions.RequestException as exc:
            print(f"❌ Failed to download image for ID {oid}: {exc}. Skipping.")
            continue
        if img_response.status_code != 200:
            # Do not save an error page as a .jpg or record it as a valid entry.
            print(
                f"❌ Failed to download image for ID {oid}: "
                f"Status {img_response.status_code}. Skipping."
            )
            continue
        with open(file_name, 'wb') as f:
            f.write(img_response.content)

        # 5. Save Metadata
        db.append({
            "id": oid,
            "title": obj.get('title'),
            "author": obj.get('artistDisplayName'),
            "artist_nationality": obj.get('artistNationality', "Unknown"),
            "artist_gender": obj.get('artistGender', ""),
            "artist_display_bio": obj.get('artistDisplayBio', ""),
            "artist_begin_date": obj.get('artistBeginDate', ""),
            "artist_end_date": obj.get('artistEndDate', ""),
            "object_name": obj.get('objectName', ""),
            "object_year_desc": obj.get('objectDate'),
            "object_begin_date": obj.get('objectBeginDate'),
            "object_end_date": obj.get('objectEndDate'),
            "medium": obj.get('medium'),
            "classification": obj.get('classification'),
            "department": obj.get('department'),
            "period": obj.get('period', ""),
            "culture": obj.get('culture', ""),
            "repository": obj.get('repository', "Metropolitan Museum of Art"),
            "primary_image_url": img_url,
            "primary_image_small": obj.get('primaryImageSmall'),
            "local_path": file_name,
        })
        print(f"✅ Saved '{obj.get('title')}' as {oid}.jpg")
        time.sleep(0.5)  # Be kind to their servers!

    # Create the output directory up front so the final write cannot fail on a
    # missing folder after all the downloads have already been done.
    out_dir = os.path.dirname(output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output, "w", encoding="utf-8") as f:
        json.dump(db, f, indent=4)

Filter Met Museum artwork records by department keyword.

`filter_by_department(source, output, exclude)`

Filter artwork records by excluding a department keyword.

Reads a JSON database of artwork entries, removes any records whose department field contains the given keyword (case-insensitive), and writes the cleaned dataset to a new file.

Parameters:

Name	Type	Description	Default
`source`	`str`	Path to the input JSON file.	required
`output`	`str`	Path to write the filtered JSON file.	required
`exclude`	`str`	Keyword to exclude from the `department` field.	required

Source code in the_met_art_dataset/filter.py

def filter_by_department(source: str, output: str, exclude: str) -> None:
    """Filter artwork records by excluding a department keyword.

    Reads a JSON database of artwork entries, removes any records whose
    ``department`` field contains the given keyword (case-insensitive), and
    writes the cleaned dataset to a new file. Records whose ``department`` is
    missing or null are treated as having an empty department and are kept
    for any non-empty keyword. A summary of the entry counts is printed.

    Note that an empty ``exclude`` string is contained in every department
    string, so it removes every record.

    Args:
        source: Path to the input JSON file.
        output: Path to write the filtered JSON file.
        exclude: Keyword to exclude from the ``department`` field.
    """
    with open(source, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Lower-case the keyword once rather than once per record.
    keyword = exclude.lower()

    # ``or ''`` covers a key that is present with a JSON null value, which
    # ``.get('department', '')`` alone would return as None.
    filtered_data = [
        item for item in data
        if keyword not in (item.get('department') or '').lower()
    ]

    with open(output, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, indent=4, ensure_ascii=False)

    print(f"Original entries: {len(data)}")
    print(f"Cleaned entries:  {len(filtered_data)}")
    print(f"Removed:          {len(data) - len(filtered_data)}")