Creating CIF files from Optimade API

Hello!

For a project I am working on I am generating a database of CIF files for as much crystals as possible. I have been playing around with the Optimade API but often find that I am unable to retrieve enough information to generate a CIF file… I was wondering if anyone had any experience generating a CIF file from the Optimade database?

By default, each database can choose which fields they respond with, so they might e.g. exclude site positions if they want to reduce server load. Is that the issue you are facing?

Depending on how you are creating your CIFs, you might need to add something like response_fields=cartesian_site_positions,lattice_vectors,species_at_sites etc to your queries (basically any field you need).

There’s a somewhat rudimentary OPTIMADE → CIF converter available at optimade-python-tools/optimade/adapters/structures/cif.py at a73c67c3dd1a5123acde7ce374acf61b13530a3e · Materials-Consortia/optimade-python-tools · GitHub (based on ASE) which might be helpful to you.

I’m currently working on the following script to generate the CIF files (which uses the cig_gen converter):


from optimade.adapters.structures.utils import (
    cell_to_cellpar,
    fractional_coordinates,
    valid_lattice_vector,
)
from optimade.models import Species as OptimadeStructureSpecies
from optimade.models import StructureResource as OptimadeStructure
import os
import pickle
import argparse
import pandas as pd
import requests
from tqdm import tqdm
from pymatgen.ext.optimade import OptimadeRester
from optimade.models import StructureResource

try:
    import numpy as np
except ImportError:
    from warnings import warn

    from optimade.adapters.warnings import AdapterPackageNotFound

    np = None  # type: ignore[assignment]
    NUMPY_NOT_FOUND = "NumPy not found, cannot convert structure to CIF"


__all__ = ("get_cif",)

# API Endpoints dictionary
api_endpoints = {
    'aflow': 'https://aflow.org/API/optimade/',
    'alexandria': 'https://alexandria.icams.rub.de/pbe/',
    'alexandria.pbe': 'https://alexandria.icams.rub.de/pbe/',
    'alexandria.pbesol': 'https://alexandria.icams.rub.de/pbesol/',
    'cod': 'https://www.crystallography.net/cod/optimade/',
    'cmr': 'https://cmr-optimade.fysik.dtu.dk/',
    'mcloud.mc3d': 'https://aiida.materialscloud.org/mc3d/optimade/',
    'mcloud.mc2d': 'https://aiida.materialscloud.org/mc2d/optimade/',
    'mcloud.2dtopo': 'https://aiida.materialscloud.org/2dtopo/optimade/',
    'mcloud.tc-applicability': 'https://aiida.materialscloud.org/tc-applicability/optimade/',
    'mcloud.pyrene-mofs': 'https://aiida.materialscloud.org/pyrene-mofs/optimade/',
    'mcloud.curated-cofs': 'https://aiida.materialscloud.org/curated-cofs/optimade/',
    'mcloud.stoceriaitf': 'https://aiida.materialscloud.org/stoceriaitf/optimade/',
    'mcloud.scdm': 'https://aiida.materialscloud.org/autowannier/optimade/',
    'mcloud.tin-antimony-sulfoiodide': 'https://aiida.materialscloud.org/tin-antimony-sulfoiodide/optimade/',
    'mcloud.optimade-sample': 'https://aiida.materialscloud.org/optimade-sample/optimade/',
    'mp': 'https://optimade.materialsproject.org/',
    'mpdd': 'http://mpddoptimade.phaseslab.org/',
    'mpds': 'https://api.mpds.io/',
    'nmd': 'https://nomad-lab.eu/prod/rae/optimade/',
    'odbx': 'https://optimade.odbx.science/',
    'odbx.odbx_misc': 'https://optimade-misc.odbx.science/',
    'odbx.gnome': 'https://optimade-gnome.odbx.science/',
    'omdb.omdb_production': 'https://optimade.openmaterialsdb.se/',
    'oqmd': 'https://oqmd.org/optimade/',
    'jarvis': 'https://jarvis.nist.gov/optimade/jarvisdft/',
    'tcod': 'https://www.crystallography.net/tcod/optimade/',
    'twodmatpedia': 'http://optimade.2dmatpedia.org/'
}

def fetch_structures(api_name, base_url, page_limit=1):
    structures = []
    endpoint = f"{base_url}/structures"
    params = {'page_limit': page_limit}
    try:
        response = requests.get(endpoint, params=params)
        response.raise_for_status()
        data = response.json()
        structures = data.get("data", [])
    except requests.exceptions.RequestException as e:
        print(f"Error fetching structures from {api_name}: {e}")
    return structures


def get_cif(
    optimade_structure: OptimadeStructure,
) -> str:
    """Get CIF file as string from OPTIMADE structure.

    Parameters:
        optimade_structure: OPTIMADE structure.

    Returns:
        The CIF file as a single Python `str` object.

    """
    # NumPy is needed for calculations
    if globals().get("np", None) is None:
        warn(NUMPY_NOT_FOUND, AdapterPackageNotFound)
        return None  # type: ignore[return-value]

    cif = """#
# Created from an OPTIMADE structure.
#
"""

    cif += f"data_{optimade_structure.id}\n\n"

    attributes = optimade_structure.attributes

    # Do this only if there's three non-zero lattice vectors
    # NOTE: This also negates handling of lattice_vectors with null/None values
    if valid_lattice_vector(attributes.lattice_vectors):  # type:ignore[arg-type]
        a_vector, b_vector, c_vector, alpha, beta, gamma = cell_to_cellpar(
            attributes.lattice_vectors  # type: ignore[arg-type]
        )

        cif += (
            f"_cell_length_a                    {a_vector:g}\n"
            f"_cell_length_b                    {b_vector:g}\n"
            f"_cell_length_c                    {c_vector:g}\n"
            f"_cell_angle_alpha                 {alpha:g}\n"
            f"_cell_angle_beta                  {beta:g}\n"
            f"_cell_angle_gamma                 {gamma:g}\n\n"
        )
        cif += (
            "_symmetry_space_group_name_H-M    'P 1'\n"
            "_symmetry_int_tables_number       1\n\n"
            "loop_\n"
            "  _symmetry_equiv_pos_as_xyz\n"
            "  'x, y, z'\n\n"
        )

        # Since some structure viewers are having issues with cartesian coordinates,
        # we calculate the fractional coordinates if this is a 3D structure and we have all the necessary information.
        if not hasattr(attributes, "fractional_site_positions"):
            attributes.fractional_site_positions = fractional_coordinates(
                cell=attributes.lattice_vectors,  # type:ignore[arg-type]
                cartesian_positions=attributes.cartesian_site_positions,  # type:ignore[arg-type]
            )

    # NOTE: This is otherwise a bit ahead of its time, since this OPTIMADE property is part of an open PR.
    # See https://github.com/Materials-Consortia/OPTIMADE/pull/206
    coord_type = (
        "fract" if hasattr(attributes, "fractional_site_positions") else "Cartn"
    )

    cif += (
        "loop_\n"
        "  _atom_site_type_symbol\n"  # species.chemical_symbols
        "  _atom_site_label\n"  # species.name + unique int
        "  _atom_site_occupancy\n"  # species.concentration
        f"  _atom_site_{coord_type}_x\n"  # cartesian_site_positions
        f"  _atom_site_{coord_type}_y\n"  # cartesian_site_positions
        f"  _atom_site_{coord_type}_z\n"  # cartesian_site_positions
        "  _atom_site_thermal_displace_type\n"  # Set to 'Biso'
        "  _atom_site_B_iso_or_equiv\n"  # Set to 1.0:f
    )

    if coord_type == "fract":
        sites = attributes.fractional_site_positions
    else:
        sites = attributes.cartesian_site_positions

    species: dict[str, OptimadeStructureSpecies] = {
        species.name: species
        for species in attributes.species  # type: ignore[union-attr]
    }

    symbol_occurences: dict[str, int] = {}
    for site_number in range(attributes.nsites):  # type: ignore[arg-type]
        species_name = attributes.species_at_sites[site_number]  # type: ignore[index]
        site = sites[site_number]

        current_species = species[species_name]

        for index, symbol in enumerate(current_species.chemical_symbols):
            if symbol == "vacancy":
                continue

            if symbol in symbol_occurences:
                symbol_occurences[symbol] += 1
            else:
                symbol_occurences[symbol] = 1
            label = f"{symbol}{symbol_occurences[symbol]}"

            cif += (
                f"  {symbol} {label} {current_species.concentration[index]:6.4f} {site[0]:8.5f}  "
                f"{site[1]:8.5f}  {site[2]:8.5f}  {'Biso':4}  {'1.000':6}\n"
            )

    return cif

def save_cif(cif_text, output_dir, structure_id):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    file_path = os.path.join(output_dir, f"{structure_id}.cif")
    try:
        with open(file_path, 'w') as f:
            f.write(cif_text)
    except Exception as e:
        print(f"Error saving CIF for structure {structure_id}: {e}")

def compile_metadata(metadata_table, reduced_formula, database_name, cif_text):
    metadata_table.append({
        'reduced_formula': reduced_formula,
        'database': database_name,
        'cif_text': cif_text
    })

def save_pickle(metadata_table, filename):
    df = pd.DataFrame(metadata_table)
    if os.path.exists(filename):
        # Load existing data and append
        with open(filename, 'rb') as f:
            existing_df = pickle.load(f)
        df = pd.concat([existing_df, df], ignore_index=True)
    try:
        with open(filename, 'wb') as f:
            pickle.dump(df, f)
        print(f"Metadata successfully saved to {filename}")
    except Exception as e:
        print(f"Error saving pickle file: {e}")

def main():
    parser = argparse.ArgumentParser(description="Fetch structures from OPTIMADE databases and convert to CIF.")
    parser.add_argument('--database', nargs='+', required=True, help="List of databases to fetch data from.")
    parser.add_argument('--dataset', type=int, required=True, help="Number of entries to fetch from each database.")
    parser.add_argument('--output_dir', required=True, help="Directory to save CIF files and metadata pickle file.")
    args = parser.parse_args()

    output_cif_dir = os.path.join(args.output_dir, 'cif_files')
    output_pkl_file = os.path.join(args.output_dir, 'structures_metadata.pkl')

    metadata_table = []

    for db in args.database:
        if db not in api_endpoints:
            print(f"Database '{db}' not recognized. Skipping.")
            continue

        base_url = api_endpoints[db]
        print(f"Fetching structures from {db}...")
        structures = fetch_structures(db, base_url, page_limit=args.dataset)
        print(f"Number of structures fetched from {db}: {len(structures)}")

        for structure_dict in tqdm(structures, desc=f"Processing {db}", leave=False):
            try:
                structure = StructureResource(**structure_dict)
            except Exception as e:
                print(f"Error parsing structure: {e}")
                continue

            structure_id = structure.id
            attributes = structure.attributes
            reduced_formula = attributes.chemical_formula_reduced or 'N/A'

            try:
                cif_text = get_cif(structure)
            except Exception as e:
                print(f"Error generating CIF for structure {structure_id}: {e}")
                cif_text = None

            if cif_text:
                try:
                    save_cif(cif_text, output_cif_dir, structure_id)
                    compile_metadata(metadata_table, reduced_formula, db, cif_text)
                except Exception as e:
                    print(f"Error saving CIF or compiling metadata for structure {structure_id}: {e}")
            else:
                print(f"Skipping structure {structure_id} due to conversion issues.")

    # Save all metadata to a pickle file
    try:
        save_pickle(metadata_table, output_pkl_file)
    except Exception as e:
        print(f"Error saving metadata: {e}")

if __name__ == "__main__":
    main()

My issue is that it works for some databases (so far alexandria), but others like Jarvis I get errors like:

  • Sometimes the script works, sometimes it does not connect to the databases, and sometimes there is insufficient information to generate the CIF files
  • This morning for example I cant connect even though yesterday it was working
    Error fetching structures from jarvis: HTTPSConnectionPool(host=‘jarvis.nist.gov’, port=443): Max retries exceeded with url: /optimade/jarvisdft//structures?page_limit=50 (Caused by NewConnectionError(‘<urllib3.connection.HTTPSConnection object at 0x7f4d752707d0>: Failed to establish a new connection: [Errno 101] Network is unreachable’))

Thanks for the help!

Hi, if the issues are with individual databases then I recommend reporting it to them directly, or just skipping them. It doesn’t look like your script uses response_fields as suggested in my first post, so I would also try that.

If you want to be able to more robustly download “everything from everyone” I’d recommend the approach laid out here: Using the OPTIMADE client - OPTIMADE Python tools which will connect to all databases in parallel (you also won’t need to list them), and only error on the ones that are failing. Additionally it provides you with a way to cache the results in a local database, or map each entry to CIF as it comes in.

I have contacted the maintainers of Jarvis to see what the problem might be. Could you also indicate which other databases are problematic?

Thanks for all the advice! I have edited my script to incorporate all this and now it works and looks like this (for info):

from optimade.client import OptimadeClient
from optimade.models import Species as OptimadeStructureSpecies
from optimade.models import StructureResource as OptimadeStructure
from pymatgen.ext.optimade import OptimadeRester
from optimade.adapters.structures.utils import (
    cell_to_cellpar,
    fractional_coordinates,
    valid_lattice_vector,
)
import os
import pickle
import argparse
import pandas as pd
from tqdm import tqdm
from warnings import warn
import logging

try:
    import numpy as np
except ImportError:
    from optimade.adapters.warnings import AdapterPackageNotFound

    np = None
    NUMPY_NOT_FOUND = "NumPy not found, cannot convert structure to CIF"


# Callbacks for handling errors
def log_error(url, results):
    if results.get("errors"):
        logging.error(f"Errors encountered for {url}: {results['errors']}")


def get_cif(
    optimade_structure: OptimadeStructure,
) -> str:
    """Get CIF file as string from OPTIMADE structure."""

    # NumPy is needed for calculations
    if globals().get("np", None) is None:
        warn(NUMPY_NOT_FOUND, AdapterPackageNotFound)
        return None  # type: ignore[return-value]

    cif = """#
# Created from an OPTIMADE structure.
#
"""

    cif += f"data_{optimade_structure.id}\n\n"

    attributes = optimade_structure.attributes

    # Check for valid lattice vectors
    if valid_lattice_vector(attributes.lattice_vectors):
        a_vector, b_vector, c_vector, alpha, beta, gamma = cell_to_cellpar(
            attributes.lattice_vectors
        )

        cif += (
            f"_cell_length_a                    {a_vector:g}\n"
            f"_cell_length_b                    {b_vector:g}\n"
            f"_cell_length_c                    {c_vector:g}\n"
            f"_cell_angle_alpha                 {alpha:g}\n"
            f"_cell_angle_beta                  {beta:g}\n"
            f"_cell_angle_gamma                 {gamma:g}\n\n"
        )
        cif += (
            "_symmetry_space_group_name_H-M    'P 1'\n"
            "_symmetry_int_tables_number       1\n\n"
            "loop_\n"
            "  _symmetry_equiv_pos_as_xyz\n"
            "  'x, y, z'\n\n"
        )

        # Calculate fractional coordinates if not provided
        if not hasattr(attributes, "fractional_site_positions"):
            attributes.fractional_site_positions = fractional_coordinates(
                cell=attributes.lattice_vectors,
                cartesian_positions=attributes.cartesian_site_positions,
            )

    coord_type = (
        "fract" if hasattr(attributes, "fractional_site_positions") else "Cartn"
    )

    cif += (
        "loop_\n"
        "  _atom_site_type_symbol\n"  # species.chemical_symbols
        "  _atom_site_label\n"         # species.name + unique int
        "  _atom_site_occupancy\n"     # species.concentration
        f"  _atom_site_{coord_type}_x\n"  # site positions
        f"  _atom_site_{coord_type}_y\n"  # site positions
        f"  _atom_site_{coord_type}_z\n"  # site positions
        "  _atom_site_thermal_displace_type\n"  # Set to 'Biso'
        "  _atom_site_B_iso_or_equiv\n"          # Set to 1.0
    )

    if coord_type == "fract":
        sites = attributes.fractional_site_positions
    else:
        sites = attributes.cartesian_site_positions

    species: dict[str, OptimadeStructureSpecies] = {
        species.name: species
        for species in attributes.species
    }

    symbol_occurrences: dict[str, int] = {}
    for site_number in range(attributes.nsites):
        species_name = attributes.species_at_sites[site_number]
        site = sites[site_number]

        current_species = species[species_name]

        for index, symbol in enumerate(current_species.chemical_symbols):
            if symbol == "vacancy":
                continue

            if symbol in symbol_occurrences:
                symbol_occurrences[symbol] += 1
            else:
                symbol_occurrences[symbol] = 1
            label = f"{symbol}{symbol_occurrences[symbol]}"

            cif += (
                f"  {symbol} {label} {current_species.concentration[index]:6.4f} {site[0]:8.5f}  "
                f"{site[1]:8.5f}  {site[2]:8.5f}  {'Biso':4}  {'1.000':6}\n"
            )

    return cif



def save_cif(cif_text, output_dir, structure_id):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    file_path = os.path.join(output_dir, f"{structure_id}.cif")
    try:
        with open(file_path, 'w') as f:
            f.write(cif_text)
    except Exception as e:
        logging.error(f"Error saving CIF for structure {structure_id}: {e}")


def main():
    parser = argparse.ArgumentParser(
        description="Fetch structures from OPTIMADE databases and convert to CIF."
    )
    parser.add_argument(
        '--dataset',
        type=int,
        required=True,
        help="Number of entries to fetch from each database.",
    )
    parser.add_argument(
        '--output_dir',
        required=True,
        help="Directory to save CIF files and metadata pickle file.",
    )
    args = parser.parse_args()

    output_cif_dir = os.path.join(args.output_dir, 'cif_files')
    output_pkl_file = os.path.join(args.output_dir, 'structures_metadata.pkl')

    # Set up logging to a file in the output directory
    log_file = os.path.join(args.output_dir, 'outputs.log')
    logging.basicConfig(
        filename=log_file,
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    metadata_table = []

    # Define the essential response fields
    response_fields = [
        'id',
        'lattice_vectors',
        'cartesian_site_positions',
        'species',
        'species_at_sites',
        'nsites',
        'last_modified',          # Added field
        'structure_features',     # Added field
        'elements',               # Optional: Include if needed
        'chemical_formula_descriptive',  # Optional: Include if needed
    ]

    # Set up OptimadeClient to fetch structures across multiple databases
    client = OptimadeClient(
        max_results_per_provider=args.dataset,
        callbacks=[log_error],
        use_async=True,
        silent=False,
        max_attempts=3,
        # exclude_providers=['tcod', 'cod'],
    )

    # Fetch structures from all available providers with specified response fields
    try:
        all_results = client.get(response_fields=response_fields)
        logging.info("Successfully fetched structures from OPTIMADE providers.")
    except Exception as e:
        logging.error(f"Error fetching structures: {e}")
        return

    # Correctly iterate over the results
    structures_results = all_results.get('structures', {})
    total_structures = 0

    # log structures from each database

    for filter_string, filter_results in structures_results.items():
        for base_url, results in tqdm(
            filter_results.items(), desc="Processing databases"
        ):

            if "errors" in results and results["errors"]:
                logging.error(f"Errors encountered for {base_url}: {results['errors']}")
                

            data = results.get("data", [])
            for structure_dict in data:
                try:
                    structure = OptimadeStructure(**structure_dict)
                except Exception as e:
                    logging.error(f"Error parsing structure: {e}")
                    

                structure_id = structure.id
                reduced_formula = (
                    structure.attributes.chemical_formula_reduced or 'N/A'
                )
                logging.info(f"Processing structure {structure_id} with formula {reduced_formula}")

                try:
                    cif_text = get_cif(structure)
                    if cif_text:
                        save_cif(cif_text, output_cif_dir, structure_id)
                        metadata_table.append(
                            {
                                'reduced_formula': reduced_formula,
                                'database': base_url,
                                'cif_text': cif_text,
                            }
                        )
                        logging.info(f"Successfully processed structure {structure_id} from {base_url}")
                        total_structures += 1
                except Exception as e:
                    logging.error(f"Error handling structure {structure_id}: {e}")

    # Save all metadata to a pickle file
    if metadata_table:
        df = pd.DataFrame(metadata_table)
        try:
            with open(output_pkl_file, 'wb') as f:
                pickle.dump(df, f)
            logging.info(f"Metadata successfully saved to {output_pkl_file}")
        except Exception as e:
            logging.error(f"Error saving metadata: {e}")
    else:
        logging.warning("No metadata to save.")

    logging.info(f"Total CIF files generated: {total_structures}")

if __name__ == '__main__':
    main()

I also have an error log that shows all the problematic cases but I can’t upload it as I’m a new user. If this is of interest I can send it over.

An example:
2024-10-29 17:29:56,212 - ERROR - Error parsing structure: 1 validation error for StructureResource
attributes
Value error, (‘relationships’, ‘links’, ‘id’, ‘type’) MUST NOT be fields under Attributes [type=value_error, input_value={‘cartesian_site_position…structure_features’: }, input_type=dict]
For further information visit Redirecting...
2024-10-29 17:29:56,212 - INFO - Processing structure agm003644889 with formula N/A
2024-10-29 17:29:56,212 - INFO - Successfully processed structure agm003644889 from https://www.crystallography.net/cod/optimade

If any more info is of interest let me know. Otherwise it now works and I’ve generated from all most all the database to thank you!