"""CIF loading helpers that preserve kMC site metadata.
The parser implementation is adapted from pymatgen; see cif_LICENSE.rst.
"""
from pymatgen.io.cif import CifParser
from pymatgen.core.operations import MagSymmOp, SymmOp
import warnings
from pymatgen.util.coord import find_in_coord_list_pbc, in_coord_list_pbc
from io import StringIO
import re
import numpy as np
from pymatgen.electronic_structure.core import Magmom
import math
from pymatgen.core.composition import Composition
from pymatgen.core.periodic_table import DummySpecies, Element, Species, get_el_sp
from pymatgen.symmetry.analyzer import SpacegroupOperations
from pymatgen.symmetry.structure import SymmetrizedStructure
from pymatgen.util.coord import find_in_coord_list_pbc
from monty.io import zopen
from pymatgen.core import Structure
class _LabeledCifParser(CifParser):
@staticmethod
def from_string(cif_string, **kwargs):
"""
Creates a CifParser from a string.
Args:
cif_string (str): String representation of a CIF.
**kwargs: Passthrough of all kwargs supported by CifParser.
Returns:
CifParser
"""
stream = StringIO(cif_string)
return _LabeledCifParser(stream, **kwargs)
def _get_labeled_structure(self, data, primitive, symmetrized):
"""modified version from _get_structure, which can add the atom label to the site
Args:
data (dict): data parsed by the cif parser
primitive (_type_): not implemented
symmetrized (bool): not implemented
"""
def get_num_implicit_hydrogens(sym):
num_h = {"Wat": 2, "wat": 2, "O-H": 1}
return num_h.get(sym[:3], 0)
lattice = self.get_lattice(data)
# if magCIF, get magnetic symmetry moments and magmoms
# else standard CIF, and use empty magmom dict
if self.feature_flags["magcif_incommensurate"]:
raise NotImplementedError(
"Incommensurate structures not currently supported."
)
if self.feature_flags["magcif"]:
self.symmetry_operations = self.get_magsymops(data)
magmoms = self.parse_magmoms(data, lattice=lattice)
else:
self.symmetry_operations = self.get_symops(data)
magmoms = {}
oxi_states = self._parse_oxi_states(data)
coord_to_species = {}
coord_to_magmoms = {}
# add the coords_to_labels to record the label (Na1,Na2,Zr,S/P, etc.)
coord_to_labels = {}
def get_matching_coord(coord):
keys = list(coord_to_species.keys())
coords = np.array(keys)
for op in self.symmetry_operations:
c = op.operate(coord)
inds = find_in_coord_list_pbc(coords, c, atol=self._site_tolerance)
# can't use if inds, because python is dumb and np.array([0]) evaluates
# to False
if len(inds):
return keys[inds[0]]
return False
for i in range(len(data["_atom_site_label"])):
try:
# If site type symbol exists, use it. Otherwise, we use the
# label.
symbol = self._parse_symbol(data["_atom_site_type_symbol"][i])
num_h = get_num_implicit_hydrogens(data["_atom_site_type_symbol"][i])
except KeyError:
symbol = self._parse_symbol(data["_atom_site_label"][i])
num_h = get_num_implicit_hydrogens(data["_atom_site_label"][i])
if not symbol:
continue
if oxi_states is not None:
o_s = oxi_states.get(symbol, 0)
# use _atom_site_type_symbol if possible for oxidation state
if "_atom_site_type_symbol" in data.data.keys():
oxi_symbol = data["_atom_site_type_symbol"][i]
o_s = oxi_states.get(oxi_symbol, o_s)
try:
el = Species(symbol, o_s)
except Exception:
el = DummySpecies(symbol, o_s)
else:
el = get_el_sp(symbol)
x = self.str2float(data["_atom_site_fract_x"][i])
y = self.str2float(data["_atom_site_fract_y"][i])
z = self.str2float(data["_atom_site_fract_z"][i])
magmom = magmoms.get(data["_atom_site_label"][i], np.array([0, 0, 0]))
try:
occu = self.str2float(data["_atom_site_occupancy"][i])
except (KeyError, ValueError):
occu = 1
if occu > 0:
coord = (x, y, z)
match = get_matching_coord(coord)
comp_d = {el: occu}
if num_h > 0:
comp_d["H"] = num_h
self.warnings.append(
"Structure has implicit hydrogens defined, "
"parsed structure unlikely to be suitable for use "
"in calculations unless hydrogens added."
)
comp = Composition(comp_d)
if not match:
coord_to_species[coord] = comp
coord_to_magmoms[coord] = magmom
coord_to_labels[coord] = data["_atom_site_label"][i]
else:
coord_to_species[match] += comp
# disordered magnetic not currently supported
coord_to_magmoms[match] = None
coord_to_labels[coord] += data["_atom_site_label"][i]
sum_occu = [
sum(c.values())
for c in coord_to_species.values()
if not set(c.elements) == {Element("O"), Element("H")}
]
if any(o > 1 for o in sum_occu):
msg = (
"Some occupancies ({}) sum to > 1! If they are within "
"the occupancy_tolerance, they will be rescaled. "
"The current occupancy_tolerance is set to: {}".format(
sum_occu, self._occupancy_tolerance
)
)
warnings.warn(msg)
self.warnings.append(msg)
allspecies = []
allcoords = []
allmagmoms = []
allhydrogens = []
equivalent_indices = []
site_properties = {}
# initialize the wyckoff_sequence here
site_properties["wyckoff_sequence"] = []
site_properties["local_index"] = []
site_properties["label"] = []
local_index = 0
# check to see if magCIF file is disordered
if self.feature_flags["magcif"]:
for k, v in coord_to_magmoms.items():
if v is None:
# Proposed solution to this is to instead store magnetic
# moments as Species 'spin' property, instead of site
# property, but this introduces ambiguities for end user
# (such as unintended use of `spin` and Species will have
# fictitious oxidation state).
raise NotImplementedError(
"Disordered magnetic structures not currently supported."
)
# print(coord_to_species)
if coord_to_species.items():
"""
for idx, (comp, group) in enumerate(
sorted(list(coord_to_species.items()), key=lambda x: x[1])
):
print(comp,group)
"""
for idx, (coord, comp) in enumerate(list(coord_to_species.items())):
# I delete the weird group function
# now just for every initial site, generate all sites
# print(idx,comp)#debug
tmp_coords = [coord] # follow the fashion
tmp_magmom = [coord_to_magmoms[tmp_coord] for tmp_coord in tmp_coords]
tmp_label = [coord_to_labels[tmp_coord] for tmp_coord in tmp_coords]
# print(tmp_coords,tmp_magmom,tmp_label)#debug
if self.feature_flags["magcif"]:
coords, magmoms, _ = self._unique_coords(
tmp_coords, magmoms_in=tmp_magmom, lattice=lattice
)
else:
coords, magmoms, _ = self._unique_coords(tmp_coords)
# add wyckoff sequence here
for wyckoff_sequence in range(0, len(coords)):
# tuple of wyckoff sequence, (label,sequence ) in format of (str,int)
site_properties["wyckoff_sequence"].append(wyckoff_sequence)
site_properties["label"].append(coord_to_labels[tmp_coords[0]])
site_properties["local_index"].append(local_index)
local_index += 1
if set(comp.elements) == {Element("O"), Element("H")}:
# O with implicit hydrogens
im_h = comp["H"]
species = Composition({"O": comp["O"]})
else:
im_h = 0
species = comp
# The following might be a more natural representation of equivalent indices,
# but is not in the format expect by SymmetrizedStructure:
# equivalent_indices.append(list(range(len(allcoords), len(coords)+len(allcoords))))
# The above gives a list like:
# [[0, 1, 2, 3], [4, 5, 6, 7, 8, 9, 10, 11]] where the
# integers are site indices, whereas the version used below will give a version like:
# [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
# which is a list in the same order as the sites, but where if a site has the same integer
# it is equivalent.
equivalent_indices += len(coords) * [idx]
allhydrogens.extend(len(coords) * [im_h])
allcoords.extend(coords)
allspecies.extend(len(coords) * [species])
allmagmoms.extend(magmoms)
# rescale occupancies if necessary
for i, species in enumerate(allspecies):
totaloccu = sum(species.values())
if 1 < totaloccu <= self._occupancy_tolerance:
allspecies[i] = species / totaloccu
if (
allspecies
and len(allspecies) == len(allcoords)
and len(allspecies) == len(allmagmoms)
):
# site_properties = {}
if any(allhydrogens):
assert len(allhydrogens) == len(allcoords)
site_properties["implicit_hydrogens"] = allhydrogens
if self.feature_flags["magcif"]:
site_properties["magmom"] = allmagmoms
if len(site_properties) == 0:
site_properties = None
struct = Structure(
lattice, allspecies, allcoords, site_properties=site_properties
)
if symmetrized:
# Wyckoff labels not currently parsed, note that not all CIFs will contain Wyckoff labels
# TODO: extract Wyckoff labels (or other CIF attributes) and include as site_properties
wyckoffs = ["Not Parsed"] * len(struct)
# Names of space groups are likewise not parsed (again, not all CIFs will contain this information)
# What is stored are the lists of symmetry operations used to generate the structure
# TODO: ensure space group labels are stored if present
sg = SpacegroupOperations("Not Parsed", -1, self.symmetry_operations)
return SymmetrizedStructure(struct, sg, equivalent_indices, wyckoffs)
struct = struct.get_sorted_structure()
if primitive and self.feature_flags["magcif"]:
struct = struct.get_primitive_structure(use_site_props=True)
elif primitive:
struct = struct.get_primitive_structure()
struct = struct.get_reduced_structure()
return struct
def get_labeled_structures(self, primitive=True, symmetrized=False):
"""
This is a modified version of get_structures for KMCPY, which preserve the label and the wyckoff sequence information.
2022.03.30
Return list of structures in CIF file. primitive boolean sets whether a
conventional cell structure or primitive cell structure is returned.
Args:
primitive (bool): Set to False to return conventional unit cells.
Defaults to True. With magnetic CIF files, will return primitive
magnetic cell which may be larger than nuclear primitive cell.
symmetrized (bool): If True, return a SymmetrizedStructure which will
include the equivalent indices and symmetry operations used to
create the Structure as provided by the CIF (if explicit symmetry
operations are included in the CIF) or generated from information
in the CIF (if only space group labels are provided). Note that
currently Wyckoff labels and space group labels or numbers are
not included in the generated SymmetrizedStructure, these will be
notated as "Not Parsed" or -1 respectively.
Returns:
List of Structures.
"""
if primitive and symmetrized:
raise ValueError(
"Using both 'primitive' and 'symmetrized' arguments is not currently supported "
"since unexpected behavior might result."
)
structures = []
for i, d in enumerate(self._cif.data.values()):
try:
s = self._get_labeled_structure(d, primitive, symmetrized)
if s:
structures.append(s)
except (KeyError, ValueError) as exc:
# Warn the user (Errors should never pass silently)
# A user reported a problem with cif files produced by Avogadro
# in which the atomic coordinates are in Cartesian coords.
self.warnings.append(str(exc))
warnings.warn(
f"No structure parsed for {i + 1} structure in CIF. Section of CIF file below."
)
warnings.warn(str(d))
warnings.warn(f"Error is {str(exc)}.")
if self.warnings:
warnings.warn(
"Issues encountered while parsing CIF: %s" % "\n".join(self.warnings)
)
if len(structures) == 0:
raise ValueError("Invalid cif file with no structures!")
return structures
def _unique_coords(
self,
coords,
magmoms=None,
lattice=None,
labels=None,
):
"""
Generate unique coordinates using coord and symmetry positions
and also their corresponding magnetic moments, if supplied.
"""
coords_out: list[np.ndarray] = []
labels_out = []
labels = labels or {}
if magmoms:
magmoms_out = []
if len(magmoms) != len(coords):
raise ValueError
for tmp_coord, tmp_magmom in zip(coords, magmoms):
for op in self.symmetry_operations:
coord = op.operate(tmp_coord)
coord = np.array([i - math.floor(i) for i in coord])
if isinstance(op, MagSymmOp):
# Up to this point, magmoms have been defined relative
# to crystal axis. Now convert to Cartesian and into
# a Magmom object.
magmom = Magmom.from_moment_relative_to_crystal_axes(
op.operate_magmom(tmp_magmom), lattice=lattice
)
else:
magmom = Magmom(tmp_magmom)
if not in_coord_list_pbc(
coords_out, coord, atol=self._site_tolerance
):
coords_out.append(coord)
magmoms_out.append(magmom)
labels_out.append(labels.get(tmp_coord))
return coords_out, magmoms_out, labels_out
for tmp_coord in coords:
for op in self.symmetry_operations:
coord = op.operate(tmp_coord)
coord = np.array([i - math.floor(i) for i in coord])
if not in_coord_list_pbc(coords_out, coord, atol=self._site_tolerance):
coords_out.append(coord)
labels_out.append(labels.get(tmp_coord))
dummy_magmoms = [Magmom(0)] * len(coords_out)
return coords_out, dummy_magmoms, labels_out
@staticmethod
def str2float(text):
"""
Remove uncertainty brackets from strings and return the float.
"""
try:
# Note that the ending ) is sometimes missing. That is why the code has
# been modified to treat it as optional. Same logic applies to lists.
return float(re.sub(r"\(.+\)*", "", text))
except TypeError:
if isinstance(text, list) and len(text) == 1:
return float(re.sub(r"\(.+\)*", "", text[0]))
except ValueError as ex:
if text.strip() == ".":
return 0
raise ex
raise ValueError(f"{text} cannot be converted to float")
[docs]
def load_labeled_structures_from_cif(
filename: str,
primitive: bool = False,
symmetrized: bool = False,
**parser_kwargs,
) -> list[Structure]:
"""Load structures from a CIF and preserve label/Wyckoff site metadata."""
with zopen(str(filename), "rt") as handle:
parser = _LabeledCifParser.from_string(handle.read(), **parser_kwargs)
return parser.get_labeled_structures(
primitive=primitive,
symmetrized=symmetrized,
)
[docs]
def load_labeled_structure_from_cif(
filename: str,
primitive: bool = False,
**parser_kwargs,
) -> Structure:
"""Load the first labeled structure from a CIF file."""
return load_labeled_structures_from_cif(
filename,
primitive=primitive,
**parser_kwargs,
)[0]
[docs]
def load_labeled_structure_from_string(
cif_string: str,
primitive: bool = False,
symmetrized: bool = False,
**parser_kwargs,
) -> Structure:
"""Load a labeled structure from CIF text."""
parser = _LabeledCifParser.from_string(cif_string, **parser_kwargs)
return parser.get_labeled_structures(
primitive=primitive,
symmetrized=symmetrized,
)[0]
__all__ = [
"load_labeled_structure_from_cif",
"load_labeled_structures_from_cif",
"load_labeled_structure_from_string",
]