"""Run TopoStats.
This provides an entry point for running TopoStats as a command line programme.
"""
import importlib.resources as pkg_resources
import logging
import sys
from collections import defaultdict
from functools import partial
from multiprocessing import Pool
from pathlib import Path
from pprint import pformat

import pandas as pd
import yaml
from tqdm import tqdm
from topostats.io import (
    LoadScans,
    find_files,
    read_yaml,
    save_folder_grainstats,
    write_config_with_comments,
    write_yaml,
)
from topostats.logs.logs import LOGGER_NAME
from topostats.plotting import toposum
from topostats.processing import check_run_steps, completion_message, process_scan
from topostats.utils import update_config, update_plotting_config
from topostats.validation import validate_config, DEFAULT_CONFIG_SCHEMA, PLOTTING_SCHEMA, SUMMARY_SCHEMA
# The logger is already set up in __init__.py and setup_logger() is idempotent, so
# calling it here would return the same object as in __init__.py.
# Ref : https://stackoverflow.com/a/57799639/1444043
# LOGGER = setup_logger(LOGGER_NAME)
LOGGER = logging.getLogger(LOGGER_NAME)
# pylint: disable=too-many-branches
# pylint: disable=too-many-locals
# pylint: disable=too-many-statements
# pylint: disable=unnecessary-dict-index-lookup
# pylint: disable=too-many-nested-blocks
def run_topostats(args=None):  # noqa: C901
    """Find and process all files.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command line arguments. Attributes read directly here include
        ``config_file``, ``create_config_file`` and ``summary_config``; any other
        attributes are merged into the configuration by update_config().
    """
    # Parse command line options, load config (or default) and update with command line options
    if args.config_file is not None:
        config = read_yaml(args.config_file)
    else:
        default_config = pkg_resources.open_text(__package__, "default_config.yaml").read()
        config = yaml.safe_load(default_config)
    # Override the config with any command line arguments passed in, e.g. --output_dir ./output/
    config = update_config(config, args)
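    # For reference, the top-level configuration keys read in this function (all
    # visible in the code below): base_dir, output_dir, file_ext, log_level, cores,
    # loading, filter, grains, grainstats, dnatracing, plotting and summary_stats.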
    # Set logging level
    if config["log_level"] == "warning":
        LOGGER.setLevel("WARNING")
    elif config["log_level"] == "error":
        LOGGER.setLevel("ERROR")
    elif config["log_level"] == "debug":
        LOGGER.setLevel("DEBUG")
    else:
        LOGGER.setLevel("INFO")
    # Validate configuration
    validate_config(config, schema=DEFAULT_CONFIG_SCHEMA, config_type="YAML configuration file")
    # Write sample configuration if asked to do so and exit
    if args.create_config_file and args.config_file:
        raise ValueError("--create-config-file and --config cannot be used together.")
    if args.create_config_file:
        write_config_with_comments(config=default_config, output_dir=Path.cwd(), filename=args.create_config_file)
        sys.exit()
    # Create base output directory
    config["output_dir"].mkdir(parents=True, exist_ok=True)
    # Load plotting_dictionary and validate
    plotting_dictionary = pkg_resources.open_text(__package__, "plotting_dictionary.yaml")
    config["plotting"]["plot_dict"] = yaml.safe_load(plotting_dictionary.read())
    validate_config(
        config["plotting"]["plot_dict"], schema=PLOTTING_SCHEMA, config_type="YAML plotting configuration file"
    )
    # Check that the earlier stages of processing required by later stages are enabled.
    check_run_steps(
        filter_run=config["filter"]["run"],
        grains_run=config["grains"]["run"],
        grainstats_run=config["grainstats"]["run"],
        dnatracing_run=config["dnatracing"]["run"],
    )
    # Update config["plotting"]["plot_dict"] with plotting options
    config["plotting"] = update_plotting_config(config["plotting"])
    LOGGER.info(f"Configuration file loaded from : {args.config_file}")
    LOGGER.info(f"Scanning for images in : {config['base_dir']}")
    LOGGER.info(f"Output directory : {str(config['output_dir'])}")
    LOGGER.info(f"Looking for images with extension : {config['file_ext']}")
    img_files = find_files(config["base_dir"], file_ext=config["file_ext"])
    LOGGER.info(f"Images with extension {config['file_ext']} in {config['base_dir']} : {len(img_files)}")
    if len(img_files) == 0:
        LOGGER.error(f"No images with extension {config['file_ext']} in {config['base_dir']}")
        LOGGER.error("Please check your configuration and directories.")
        sys.exit()
    LOGGER.info(f'Thresholding method (Filtering) : {config["filter"]["threshold_method"]}')
    LOGGER.info(f'Thresholding method (Grains) : {config["grains"]["threshold_method"]}')
    LOGGER.debug(f"Configuration after update : \n{pformat(config, indent=4)}")  # noqa: T203
    processing_function = partial(
        process_scan,
        base_dir=config["base_dir"],
        filter_config=config["filter"],
        grains_config=config["grains"],
        grainstats_config=config["grainstats"],
        dnatracing_config=config["dnatracing"],
        plotting_config=config["plotting"],
        output_dir=config["output_dir"],
    )
    all_scan_data = LoadScans(img_files, **config["loading"])
    all_scan_data.get_data()
    # Get a dictionary of all the image data dictionaries.
    # Keys are the image names; values are the individual image data dictionaries.
    scan_data_dict = all_scan_data.img_dict
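    # Pool.imap_unordered() yields results as each worker finishes, so completion
    # order is not guaranteed to match input order; results are therefore keyed by
    # image name rather than collected positionally.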
    with Pool(processes=config["cores"]) as pool:
        results = defaultdict()
        image_stats_all = defaultdict()
        with tqdm(
            total=len(img_files),
            desc=f"Processing images from {config['base_dir']}, results are under {config['output_dir']}",
        ) as pbar:
            for img, result, individual_image_stats_df in pool.imap_unordered(
                processing_function,
                scan_data_dict.values(),
            ):
                results[str(img)] = result
                pbar.update()
                # Add the per-image statistics dataframe to the results dict
                image_stats_all[str(img)] = individual_image_stats_df
                # Display a completion message for the image
                LOGGER.info(f"[{img.name}] Processing completed.")
LOGGER.info(f"Saving image stats to : {config['output_dir']}/image_stats.csv.")
# Concatenate all the dictionary's values into a dataframe. Ignore the keys since
# the dataframes have the file names in them already.
image_stats_all_df = pd.concat(image_stats_all.values())
image_stats_all_df.to_csv(config["output_dir"] / "image_stats.csv")
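    # pd.concat() raises a ValueError when there is nothing to concatenate, i.e. when
    # no grains were found in any image; "results" then remains a dict, which the
    # isinstance() checks below guard against.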
    try:
        results = pd.concat(results.values())
    except ValueError as error:
        LOGGER.error("No grains found in any images, consider adjusting your thresholds.")
        LOGGER.error(error)
    # Summary Statistics and Plots
    if config["summary_stats"]["run"]:
        # Load and validate the summary plots/statistics configuration. Precedence: a
        # command line argument first, then the value in the main config file; if
        # neither is provided the default topostats/summary_config.yaml is loaded.
        if args.summary_config is not None:
            summary_config = read_yaml(args.summary_config)
        elif config["summary_stats"]["config"] is not None:
            summary_config = read_yaml(config["summary_stats"]["config"])
        else:
            summary_yaml = pkg_resources.open_text(__package__, "summary_config.yaml")
            summary_config = yaml.safe_load(summary_yaml.read())
        # Do not pass command line arguments to toposum as they clash with process command line arguments
        summary_config = update_config(summary_config, {})
        validate_config(summary_config, SUMMARY_SCHEMA, config_type="YAML summarisation config")
        # We never want to load data from CSV as we are using the data that has just been processed.
        summary_config.pop("csv_file")
        # Load the variable to label mapping
        plotting_yaml = pkg_resources.open_text(__package__, "var_to_label.yaml")
        summary_config["var_to_label"] = yaml.safe_load(plotting_yaml.read())
        LOGGER.info("[plotting] Default variable to labels mapping loaded.")
        # If we do not have a DataFrame, or it is all NaN, there is nothing to plot
        if isinstance(results, pd.DataFrame) and not results.isna().values.all():
            if results.shape[0] > 1:
                # Summary output always goes in a sub-directory of config["output_dir"],
                # which may need creating
                summary_config["output_dir"] = config["output_dir"] / "summary_distributions"
                summary_config["output_dir"].mkdir(parents=True, exist_ok=True)
                LOGGER.info(f"Summary plots and statistics will be saved to : {summary_config['output_dir']}")
                # Plot summaries
                summary_config["df"] = results.reset_index()
                toposum(summary_config)
            else:
                LOGGER.warning(
                    "Fewer than two grains were detected, so summary plots cannot be made."
                )
        else:
            LOGGER.warning(
                "There are no results to plot, either...\n\n"
                "* you have disabled grains/grainstats/dnatracing.\n"
                "* no grains have been detected across all scans.\n"
                "* there have been errors.\n\n"
                "If you are not expecting to detect grains please consider disabling "
                "grains/grainstats/dnatracing/plotting/summary_stats. If you are expecting to detect grains"
                " please check log-files for further information."
            )
    else:
        summary_config = None
    # Write statistics to CSV if there is data.
    if isinstance(results, pd.DataFrame) and not results.isna().values.all():
        # Re-index on image, threshold and molecule number so all_statistics.csv has a
        # meaningful MultiIndex.
        results.reset_index(inplace=True)
        results.set_index(["image", "threshold", "molecule_number"], inplace=True)
        results.to_csv(config["output_dir"] / "all_statistics.csv", index=True)
        save_folder_grainstats(config["output_dir"], config["base_dir"], results)
        results.reset_index(inplace=True)  # So we can access unique image names
        images_processed = len(results["image"].unique())
    else:
        images_processed = 0
        LOGGER.warning("There are no grainstats or dnatracing statistics to write to CSV.")
    # Write the configuration used for this run back out, dropping the plot_dict
    # (loaded at runtime from plotting_dictionary.yaml) first.
    config["plotting"].pop("plot_dict")
    write_yaml(config, output_dir=config["output_dir"])
    LOGGER.debug(f"Images processed : {images_processed}")
    completion_message(config, img_files, summary_config, images_processed)
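
# A minimal usage sketch (an assumption, not part of the original module): TopoStats
# exposes run_topostats() as its command line entry point, so an equivalent direct
# invocation would build an argparse.Namespace with the attributes read above, e.g.:
#
#     from argparse import Namespace
#     run_topostats(args=Namespace(config_file=None, create_config_file=None, summary_config=None))
#
# Any further attributes on the Namespace (e.g. output_dir) are merged into the
# configuration by update_config().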