Source code for pyslyde.slide

#!/usr/bin/env python3

"""
slide.py: Contains the Slide and Annotations classes.

Slide class: Wrapper around openslide.OpenSlide with annotation overlay and mask generation.
Annotations class: Parses annotation files from QuPath, ImageJ, and ASAP, as well as plain JSON and CSV exports.
"""

import os
import glob
import json
import itertools
import operator as op
import xml.etree.ElementTree as ET
from typing import Any, Dict, List, Optional, Tuple, Union, Callable, Sequence
from itertools import chain

import numpy as np
import cv2
import openslide
from openslide import OpenSlide
import pandas as pd
from matplotlib.path import Path
import seaborn as sns

from pyslyde.util.utilities import mask2rgb


__author__ = 'Gregory Verghese'
__email__ = 'gregory.verghese@gmail.com'


class Slide(OpenSlide):
    """
    Whole Slide Image (WSI) object that enables annotation overlay as a
    wrapper around openslide.OpenSlide. Generates annotation masks.

    Attributes:
        mag (int): Magnification level (a key of ``MAG_FACTORS``).
        dims (tuple): Dimensions of the WSI as (x, y).
        name (str): Base name of the slide file.
        annotations (Annotations, optional): Parsed annotations, if any.
        _border (list): Border coordinates [(x1, x2), (y1, y2)], or None
            until a border has been computed.
    """

    # Downsample factor associated with each magnification level.
    MAG_FACTORS: Dict[int, int] = {0: 1, 1: 2, 2: 4, 3: 8, 4: 16, 5: 32}
    # Size used for the thumbnail mask exposed by ``slide_mask``.
    MASK_SIZE: Tuple[int, int] = (2000, 2000)

    def __init__(self,
                 filename: str,
                 mag: int = 0,
                 annotations: Optional['Annotations'] = None,
                 annotations_path: Optional[Union[str, List[str]]] = None,
                 labels: Optional[List[str]] = None,
                 source: Optional[str] = None) -> None:
        """
        Open a slide and optionally attach annotations.

        Args:
            filename (str): Path to the slide file.
            mag (int): Magnification level.
            annotations (Annotations, optional): Ready-made annotations;
                takes precedence over ``annotations_path``.
            annotations_path (str or list, optional): Annotation file(s) to
                parse. Only used when ``source`` is also given.
            labels (list, optional): Labels to keep when parsing.
            source (str, optional): Annotation source type.
        """
        super().__init__(filename)
        self.mag: int = mag
        self.dims: Tuple[int, int] = self.dimensions
        self.name: str = os.path.basename(filename)
        self._border: Optional[List[Tuple[int, int]]] = None
        self.annotations: Optional['Annotations'] = None

        if annotations is not None:
            self.annotations = annotations
        elif annotations_path is not None and source is not None:
            self.annotations = Annotations(
                annotations_path,
                source=source,
                labels=labels,
                encode=True
            )

    @property
    def slide_mask(self) -> np.ndarray:
        """Get the slide mask, resized to ``MASK_SIZE``, as an RGB array."""
        mask = self.generate_mask(Slide.MASK_SIZE)
        return mask2rgb(mask)

    def generate_mask(self,
                      size: Optional[Tuple[int, int]] = None,
                      labels: Optional[List[Union[int, str]]] = None
                      ) -> np.ndarray:
        """
        Generate a mask representation of annotations.

        Args:
            size (tuple, optional): Dimensions of the returned mask. When
                omitted the mask has the slide's full dimensions.
            labels (list, optional): Labels to include, as class names or
                integer class keys. All classes are drawn when omitted.

        Returns:
            np.ndarray: Single-channel uint8 mask with an integer per class
                and 0 for background.
        """
        width, height = self.dims[0], self.dims[1]
        slide_mask = np.zeros((height, width), dtype=np.uint8)

        coordinates = None
        if self.annotations is not None:
            # Request integer-encoded keys for this one read.
            self.annotations.encode = True
            coordinates = self.annotations.annotations

        if coordinates:
            if labels:
                class_key = self.annotations.class_key
                wanted = set()
                for label in labels:
                    if isinstance(label, str) and label in class_key:
                        wanted.add(class_key[label])
                    elif isinstance(label, int):
                        wanted.add(label)
            else:
                wanted = set(coordinates)

            # Draw in ascending key order so higher keys win on overlap.
            for key in sorted(coordinates):
                if key not in wanted:
                    continue
                # fillPoly needs 32-bit integer points; skip empty polygons.
                polygons = [np.array(points, dtype=np.int32)
                            for points in coordinates[key] if len(points)]
                if polygons:
                    cv2.fillPoly(slide_mask, polygons, color=(int(key),))

        if size is not None:
            # Nearest-neighbour keeps class ids intact; interpolating would
            # blend neighbouring ids into values that match no class.
            slide_mask = cv2.resize(slide_mask, size,
                                    interpolation=cv2.INTER_NEAREST)
        return slide_mask

    @staticmethod
    def resize_border(dim: int,
                      factor: int = 1,
                      threshold: Optional[int] = None,
                      operator: str = '=>') -> int:
        """
        Resize and redraw annotation border. Useful to trim WSI and mask to
        a specific size.

        Picks the multiple of ``factor`` closest to ``dim`` among those that
        satisfy ``multiple <operator> threshold``.

        Args:
            dim (int): Dimension to resize.
            factor (int): Border increments.
            threshold (int, optional): Minimum/maximum size. Defaults to
                ``dim``.
            operator (str): Threshold limit operator: '>', '=>', '<' or
                '=<' ('>=' and '<=' are accepted as aliases).

        Returns:
            int: New border dimension.

        Raises:
            KeyError: If ``operator`` is not recognised.
            ValueError: If no multiple of ``factor`` satisfies the threshold.
        """
        if threshold is None:
            threshold = dim

        operator_dict: Dict[str, Callable] = {
            '>': op.gt,
            '=>': op.ge,
            '>=': op.ge,
            '<': op.lt,
            '=<': op.le,
            '<=': op.le,
        }
        op_func = operator_dict[operator]

        candidates = [multiple
                      for multiple in (factor * i for i in range(100000))
                      if op_func(multiple, threshold)]
        if not candidates:
            raise ValueError(
                f"no multiple of {factor} satisfies '{operator}' {threshold}"
            )
        # min() returns the first (smallest) candidate on ties.
        return min(candidates, key=lambda multiple: abs(dim - multiple))

    def get_border(self, space: int = 100) -> List[Tuple[int, int]]:
        """
        Generate border around max/min annotation points.

        Falls back to the whole slide when there are no annotation points.
        The border is clamped to the slide so padding cannot produce
        negative or out-of-range coordinates.

        Args:
            space (int): Gap between max/min annotation point and border.

        Returns:
            list: Border dimensions [(x1, x2), (y1, y2)], divided by the
                factor of the slide's magnification level.
        """
        width, height = self.dims[0], self.dims[1]
        border = [(0, width), (0, height)]

        coordinates = None
        if self.annotations is not None:
            coordinates = self.annotations.annotations

        if coordinates:
            points = [point
                      for polygons in coordinates.values()
                      for polygon in polygons
                      for point in polygon]
            if points:
                xs = [point[0] for point in points]
                ys = [point[1] for point in points]
                border = [
                    (max(min(xs) - space, 0), min(max(xs) + space, width)),
                    (max(min(ys) - space, 0), min(max(ys) + space, height)),
                ]

        mag_factor = Slide.MAG_FACTORS[self.mag]
        self._border = [(int(low / mag_factor), int(high / mag_factor))
                        for low, high in border]
        return self._border

    def detect_components(self,
                          level_dims: int = 6,
                          num_component: Optional[int] = None,
                          min_size: Optional[int] = None
                          ) -> Tuple[List[np.ndarray],
                                     List[List[Tuple[int, int]]]]:
        """
        Find the largest sections on the slide.

        Args:
            level_dims (int): Level of downsampling used for detection.
                Clamped to the deepest level the slide provides.
            num_component (int, optional): Number of largest components to
                keep.
            min_size (int, optional): Minimum contour area (in thumbnail
                pixels) of a component.

        Returns:
            tuple: (List of images with contours, list of border
                coordinates [(x1, x2), (y1, y2)] in slide coordinates).
                Every list entry refers to the same thumbnail array, on
                which all bounding boxes are drawn.
        """
        level = min(level_dims, len(self.level_dimensions) - 1)
        image = np.array(self.get_thumbnail(self.level_dimensions[level]))

        # The thumbnail is an RGB image, so use the RGB conversion code.
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        blur = cv2.bilateralFilter(np.bitwise_not(gray), 9, 100, 100)
        _, thresh = cv2.threshold(blur, 0, 255,
                                  cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_NONE)
        contours = list(contours)

        if num_component is not None:
            contours = sorted(contours, key=cv2.contourArea)[-num_component:]
        if min_size is not None:
            # Filter by area but keep the contours themselves.
            contours = [c for c in contours if cv2.contourArea(c) > min_size]

        # Contours are in thumbnail pixels; scale by the real thumbnail size.
        x_scale = self.dims[0] / image.shape[1]
        y_scale = self.dims[1] / image.shape[0]

        borders: List[List[Tuple[int, int]]] = []
        components: List[np.ndarray] = []
        image_new = image.copy()
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            x1 = round(x_scale * x)
            x2 = round(x_scale * (x + w))
            y1 = round(y_scale * y)
            y2 = round(y_scale * (y + h))
            self._border = [(x1, x2), (y1, y2)]
            image_new = cv2.rectangle(image_new, (x, y), (x + w, y + h),
                                      (0, 255, 0), 2)
            components.append(image_new)
            borders.append([(x1, x2), (y1, y2)])

        return components, borders

    @staticmethod
    def _resolve_extent(start: Any,
                        size: Optional[int]) -> Tuple[int, Optional[int]]:
        """Turn a start coordinate or (min, max) pair into (min, size)."""
        if isinstance(start, (tuple, list)):
            if size is None:
                low, high = start
                return low, high - low
            return start[0], size
        if isinstance(start, (int, np.integer)):
            return int(start), size
        return 0, size

    def generate_region(self,
                        mag: int = 0,
                        x: Optional[Union[int, Tuple[int, int]]] = None,
                        y: Optional[Union[int, Tuple[int, int]]] = None,
                        x_size: Optional[int] = None,
                        y_size: Optional[int] = None,
                        scale_border: bool = False,
                        factor: int = 1,
                        threshold: Optional[int] = None,
                        operator: str = '=>'
                        ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Extract a specific region of the slide.

        When ``x`` is omitted the annotation border is used; a ``y`` given
        by the caller is kept in that case.

        Args:
            mag (int): Magnification level.
            x (int or tuple, optional): Minimum x coordinate or
                (x_min, x_max).
            y (int or tuple, optional): Minimum y coordinate or
                (y_min, y_max).
            x_size (int, optional): Width of the region.
            y_size (int, optional): Height of the region.
            scale_border (bool): Whether to resize the border.
            factor (int): Factor for resizing.
            threshold (int, optional): Threshold for resizing.
            operator (str): Operator for threshold.

        Returns:
            tuple: (Extracted region as RGB ndarray, mask). The mask is
                cropped from the full-size mask, so its shape matches the
                region only when ``mag`` has factor 1.

        Raises:
            ValueError: If the width or height cannot be determined.
        """
        if x is None:
            border = self.get_border()
            if border and len(border) >= 2:
                border_x, border_y = border[0], border[1]
            else:
                border_x, border_y = (0, self.dims[0]), (0, self.dims[1])
            x = border_x
            if y is None:
                y = border_y

        x_min, x_size = Slide._resolve_extent(x, x_size)
        y_min, y_size = Slide._resolve_extent(y, y_size)

        if scale_border and x_size is not None and y_size is not None:
            x_size = Slide.resize_border(x_size, factor, threshold, operator)
            y_size = Slide.resize_border(y_size, factor, threshold, operator)

        # Clip the requested region to the slide.
        if x_size is not None and (x_min + x_size) > self.dimensions[0]:
            x_size = self.dimensions[0] - x_min
        if y_size is not None and (y_min + y_size) > self.dimensions[1]:
            y_size = self.dimensions[1] - y_min

        if x_size is None or y_size is None:
            raise ValueError("x_size and y_size must be specified")

        x_size_adj = int(x_size / Slide.MAG_FACTORS[mag])
        y_size_adj = int(y_size / Slide.MAG_FACTORS[mag])
        region = self.read_region((x_min, y_min), mag,
                                  (x_size_adj, y_size_adj))
        mask = self.generate_mask()[y_min:y_min + y_size,
                                    x_min:x_min + x_size]
        return np.array(region.convert('RGB')), mask

    def save(self,
             path: str,
             size: Tuple[int, int] = (2000, 2000),
             mask: bool = False) -> None:
        """
        Save a thumbnail of the slide as an image file.

        Args:
            path (str): Path to save the image.
            size (tuple): Size of the thumbnail.
            mask (bool): Whether to save the mask instead of the image.
        """
        if mask:
            # NOTE(review): mask2rgb presumably returns RGB while imwrite
            # expects BGR - confirm the channel order of the saved mask.
            cv2.imwrite(path, self.slide_mask)
        else:
            image = np.array(self.get_thumbnail(size).convert('RGB'))
            # cv2.imwrite expects BGR channel order.
            cv2.imwrite(path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
class Annotations:
    """
    Parses annotation files in XML, JSON or CSV format and returns a
    dictionary containing x, y coordinates for each region of interest (ROI).

    Annotations are stored as ``{label: [polygon, ...]}`` where each polygon
    is a list of ``[x, y]`` integer points.

    Args:
        path (str or list): Path(s) to annotation file(s).
        source (str): Annotation source type ('qupath', 'imagej', 'asap',
            'json', 'csv' or 'dataframe').
        labels (list, optional): List of ROI names to keep.
        encode (bool): Whether to encode labels as integers.
    """

    def __init__(self,
                 path: Union[str, List[str]],
                 source: str,
                 labels: Optional[List[str]] = None,
                 encode: bool = False) -> None:
        self.paths: List[str] = list(path) if isinstance(path, list) else [path]
        self.source: str = source
        # Copy so later label updates never touch the caller's list.
        self.labels: Optional[List[str]] = (
            list(labels) if labels is not None else None
        )
        self.encode: bool = encode
        self._annotations: Optional[
            Dict[Union[str, int], List[List[List[int]]]]
        ] = None
        self._generate_annotations()

    def __repr__(self) -> str:
        if self._annotations is None:
            return "Annotations(empty)"
        # Pair each class with its own count rather than with self.labels,
        # whose order and length may differ from the annotation keys.
        df = pd.DataFrame({
            "classes": list(self._annotations),
            "number": [len(v) for v in self._annotations.values()],
        })
        return str(df)

    @property
    def keys(self) -> List[Union[str, int]]:
        """Labels that currently have annotations."""
        if self._annotations is None:
            return []
        return list(self._annotations.keys())

    @property
    def values(self) -> List[List[List[List[int]]]]:
        """Polygon lists, in the same order as ``keys``."""
        if self._annotations is None:
            return []
        return list(self._annotations.values())

    @property
    def annotations(self) -> Optional[
            Dict[Union[str, int], List[List[List[int]]]]]:
        """
        Annotation dictionary; integer-keyed when ``encode`` is set.

        Reading this property resets ``encode`` to False, so encoding
        applies to a single read.
        """
        if self.encode:
            annotations = self.encode_keys()
            self.encode = False
        else:
            annotations = self._annotations
        return annotations

    @property
    def class_key(self) -> Dict[str, int]:
        """Mapping from label to integer code, starting at 1."""
        if self.labels is None:
            self.labels = (
                list(self._annotations.keys()) if self._annotations else []
            )
        return {label: i + 1 for i, label in enumerate(self.labels)}

    @property
    def numbers(self) -> Dict[str, int]:
        """Number of polygons per label."""
        if self._annotations is None:
            return {}
        return {label: len(polygons)
                for label, polygons in self._annotations.items()}

    def _generate_annotations(self) -> None:
        """
        Call the parser matching ``source`` on every path and merge the
        results, then apply the label filter if one was given.

        Raises:
            AttributeError: If no parser exists for ``source``.
        """
        self._annotations = {}
        if self.source is not None:
            parser = getattr(self, '_' + self.source, None)
            if not callable(parser):
                raise AttributeError(
                    f"unsupported annotation source: {self.source!r}"
                )
            for path in self.paths:
                for label, polygons in parser(path).items():
                    # extend keeps polygons from several files side by side
                    # instead of nesting one list inside another.
                    self._annotations.setdefault(label, []).extend(polygons)

        if self.labels:
            self._annotations = self.filter_labels(self.labels)
        else:
            self.labels = list(self._annotations.keys())

    def filter_labels(self, labels: List[str]
                      ) -> Dict[Union[str, int], List[List[List[int]]]]:
        """
        Remove labels from annotations.

        Args:
            labels (list): Label list to keep.

        Returns:
            dict: Filtered annotation dictionary.
        """
        self.labels = labels
        if self._annotations is None:
            return {}
        for key in list(self._annotations):
            if key not in labels:
                del self._annotations[key]
        return self._annotations

    def rename_labels(self, names: Dict[str, str]) -> None:
        """
        Rename annotation labels.

        Labels keep their position, so integer codes from ``class_key`` are
        unchanged by a rename. Labels renamed to an existing name are merged
        with it. Names that are not present are ignored.

        Args:
            names (dict): Mapping from current labels to new labels.
        """
        if self._annotations is None:
            return

        renamed: Dict[Union[str, int], List[List[List[int]]]] = {}
        for key, polygons in self._annotations.items():
            renamed.setdefault(names.get(key, key), []).extend(polygons)
        self._annotations = renamed

        current = self.labels if self.labels is not None else list(renamed)
        ordered = dict.fromkeys(names.get(label, label) for label in current)
        # Make sure every annotated label has a code.
        for key in renamed:
            ordered.setdefault(key)
        self.labels = list(ordered)

    def encode_keys(self) -> Dict[int, List[List[List[int]]]]:
        """
        Encode labels as integer values.

        Returns:
            dict: Annotations with integer keys taken from ``class_key``.
        """
        if self._annotations is None:
            return {}
        class_key = self.class_key
        return {class_key[k]: v for k, v in self._annotations.items()}

    def _imagej(self, path: str) -> Dict[str, List[List[List[int]]]]:
        """
        Parse ImageJ XML annotation files.

        Args:
            path (str): Path to the XML file.

        Returns:
            dict: Annotations dictionary, labels in document order.
        """
        root = ET.parse(path).getroot()
        # Document order (not set order) keeps label codes reproducible.
        annotations: Dict[str, List[List[List[int]]]] = {
            node.attrib['Name']: [] for node in root.iter('Annotation')
        }
        for node in root.findall('Annotation'):
            label = node.attrib['Name']
            for vertices in node.iter('Vertices'):
                polygon = [
                    [round(float(v.attrib['X'])), round(float(v.attrib['Y']))]
                    for v in vertices.iter('Vertex')
                ]
                annotations[label].append(polygon)
        return annotations

    def _asap(self, path: str) -> Dict[str, List[List[List[int]]]]:
        """
        Parse ASAP XML annotation files.

        Args:
            path (str): Path to the XML file.

        Returns:
            dict: Annotations dictionary keyed by ``PartOfGroup``, labels in
                document order.
        """
        root = ET.parse(path).getroot()
        annotations: Dict[str, List[List[List[int]]]] = {
            node.attrib['PartOfGroup']: [] for node in root.iter('Annotation')
        }
        for node in root[0].findall('Annotation'):
            polygon = [
                [round(float(c.attrib['X'])), round(float(c.attrib['Y']))]
                for c in node.iter('Coordinate')
            ]
            annotations[node.attrib['PartOfGroup']].append(polygon)
        return annotations

    def _qupath(self, path: str) -> Dict[str, List[List[List[int]]]]:
        """
        Parse QuPath annotation JSON files.

        Accepts a list of features, a GeoJSON FeatureCollection or a single
        feature. LineString, Polygon and MultiPolygon geometries are read;
        other geometry types contribute no points.

        Args:
            path (str): Path to the JSON file.

        Returns:
            dict: Annotations dictionary.
        """
        with open(path, encoding='utf-8') as json_file:
            features = json.load(json_file)
        if isinstance(features, dict):
            features = features.get('features', [features])

        annotations: Dict[str, List[List[List[int]]]] = {}
        for feature in features:
            label = feature['properties']['classification']['name']
            geometry = feature['geometry']['type']
            coordinates = feature['geometry']['coordinates']
            polygons = annotations.setdefault(label, [])

            if geometry == "LineString":
                rings = [coordinates]
            elif geometry == "Polygon":
                rings = coordinates
            elif geometry == "MultiPolygon":
                rings = [ring for polygon in coordinates for ring in polygon]
            else:
                rings = []

            for ring in rings:
                polygons.append([[int(pt[0]), int(pt[1])] for pt in ring])
        return annotations

    def _json(self, path: str) -> Dict[str, List[List[List[int]]]]:
        """
        Parse JSON annotation files shaped as
        ``{label: {id: [{"x": ..., "y": ...}, ...]}}``.

        Args:
            path (str): Path to the JSON file.

        Returns:
            dict: Annotations dictionary.
        """
        with open(path, encoding='utf-8') as json_file:
            json_annotations = json.load(json_file)
        # self.labels is left untouched so a caller's label filter applies.
        return {
            label: [[[int(pt['x']), int(pt['y'])] for pt in points]
                    for points in regions.values()]
            for label, regions in json_annotations.items()
        }

    def _dataframe(self, path: str) -> Dict[str, List[List[List[int]]]]:
        """
        Parse a CSV-backed DataFrame; same format and result as ``_csv``.

        Args:
            path (str): Path to the CSV file.

        Returns:
            dict: Annotations dictionary.
        """
        return self._csv(path)

    def _csv(self, path: str) -> Dict[str, List[List[List[int]]]]:
        """
        Parse CSV annotation files with ``labels``, ``x`` and ``y`` columns.

        All rows of a label form a single polygon. Rows without a label are
        grouped under 'undefined'.

        Args:
            path (str): Path to the CSV file.

        Returns:
            dict: Annotations dictionary, labels in order of first
                appearance.
        """
        frame = pd.read_csv(path)
        frame['labels'] = frame['labels'].fillna('undefined')

        annotations: Dict[str, List[List[List[int]]]] = {}
        # groupby also handles labels that occur on a single row.
        for label, group in frame.groupby('labels', sort=False):
            annotations[label] = [
                [[int(x), int(y)] for x, y in zip(group['x'], group['y'])]
            ]
        return annotations

    def df(self) -> pd.DataFrame:
        """
        Return a DataFrame of annotations.

        Only the first polygon of each label is exported; labels without
        polygons are skipped.

        Returns:
            pd.DataFrame: DataFrame with 'labels', 'x' and 'y' columns.
        """
        if self._annotations is None:
            return pd.DataFrame()

        labels: List[Union[str, int]] = []
        x_values: List[int] = []
        y_values: List[int] = []
        for label, polygons in self._annotations.items():
            if not polygons:
                continue
            first = polygons[0]
            labels.extend([label] * len(first))
            x_values.extend(point[0] for point in first)
            y_values.extend(point[1] for point in first)

        return pd.DataFrame({'labels': labels, 'x': x_values, 'y': y_values})

    def save(self, save_path: str) -> None:
        """
        Save annotations as a CSV file.

        Args:
            save_path (str): Path to save the annotations.
        """
        self.df().to_csv(save_path)