Add alignments to Python PCA API #2285

akenmorris · 2024-07-14T15:26:48Z

The Python API for performing PCA does not accept parameters for specifying the alignment strategy when there are multiple domains. Support for this should be added.

As an example, the following script does so outside of the ShapeWorks API:

#!/usr/bin/env python3

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import fire
import os
import sys


def read_xlsx_file(filepath):
    """Load a spreadsheet with pandas, announcing the path on stdout.

    Args:
        filepath: Location of the file handed to ``pd.read_excel``.

    Returns:
        Whatever ``pd.read_excel`` returns for *filepath*.
    """
    print(f"Reading file: {filepath}")
    return pd.read_excel(filepath)


def save_to_csv(data, filename):
    """Write a 2-D array to *filename* as CSV with headers PC1, PC2, ...

    Args:
        data: Two-dimensional array; one output column is written per
            entry along its second axis.
        filename: Destination path of the CSV file.
    """
    column_count = data.shape[1]
    header = [f'PC{number}' for number in range(1, column_count + 1)]
    frame = pd.DataFrame(data, columns=header)
    # index=False: the row index is not written to the file
    frame.to_csv(filename, index=False)


# Column-name prefixes looked up in the spreadsheet.
_WORLD_PREFIX = 'world_particles_'
_LOCAL_PREFIX = 'local_particles_'
_ALIGNMENT_PREFIX = 'alignment_'


def _parse_matrix(text):
    """Parse 16 whitespace-separated numbers into a 4x4 float array."""
    return np.array(text.split()).reshape(4, 4).astype(float)


def _axis_scaling(matrix):
    """Return the Euclidean norm of each column of the upper-left 3x3 block."""
    return np.sqrt(np.sum(matrix[:3, :3] ** 2, axis=0))


def _load_homogeneous_particles(filename):
    """Read one whitespace-separated point per line and append a column of ones."""
    with open(filename, 'r') as f:
        particles = np.array(
            [list(map(float, line.strip().split())) for line in f])
    return np.hstack((particles, np.ones((particles.shape[0], 1))))


def _sample_features(row, particle_columns, domain):
    """Return the flattened particle values of one spreadsheet row for *domain*."""
    if domain == "local":
        # The files named in the world_particles_* columns are used as-is;
        # no alignment or procrustes column is needed for this pass.
        features = []
        for col in particle_columns:
            features.extend(_load_homogeneous_particles(row[col]).flatten())
        return features

    alignment = _parse_matrix(row[f'{_ALIGNMENT_PREFIX}{domain}'])

    # The target procrustes matrix is the same for every particle column of
    # this row, so it is parsed once. "global" applies no target procrustes.
    target = None
    if domain != "global":
        target = _parse_matrix(row[f'procrustes_{domain}'])
        # remove scaling from the target procrustes
        target[:3, :3] /= _axis_scaling(target)

    features = []
    for col in particle_columns:
        # Strip the prefix instead of splitting on '_' so that domain names
        # containing underscores (or the word "world") are kept intact.
        domain_name = col[len(_WORLD_PREFIX):]
        local_col = _LOCAL_PREFIX + domain_name

        # scaling comes from this column's own procrustes matrix
        scaling = _axis_scaling(
            _parse_matrix(row[f'procrustes_{domain_name}']))

        particles = _load_homogeneous_particles(row[local_col])

        # transform each x,y,z by the selected alignment matrix
        particles = np.dot(particles, alignment.T)
        particles[:, :3] *= scaling

        if target is not None:
            # apply procrustes of target alignment domain
            particles = np.dot(particles, target.T)

        features.extend(particles.flatten())
    return features


def python_pca(file, output):
    """Run PCA once per alignment choice found in a spreadsheet and save the scores.

    The spreadsheet is expected to contain columns named
    ``world_particles_<name>``, ``local_particles_<name>``,
    ``procrustes_<name>`` and ``alignment_<name>``. Particle cells hold paths
    to text files with one whitespace-separated point per line; alignment and
    procrustes cells hold 16 whitespace-separated numbers (a 4x4 matrix).

    One PCA is computed for every ``alignment_<name>`` column plus an extra
    "local" pass:

    * ``local``: the world particle files are used untransformed.
    * ``global``: local particles are multiplied by ``alignment_global`` and
      scaled by each column's own procrustes scaling.
    * any other name: as for ``global`` but using ``alignment_<name>``, then
      additionally multiplied by ``procrustes_<name>`` with its scaling
      removed.

    Note that the appended homogeneous column of ones is part of the
    flattened feature vector of every sample.

    Args:
        file: Path of the spreadsheet, read via ``read_xlsx_file``.
        output: Prefix of the result files; scores for each alignment are
            written to ``<output>_<alignment>.csv`` via ``save_to_csv``.
    """
    data = read_xlsx_file(file)

    particle_columns = [
        col for col in data.columns if col.startswith(_WORLD_PREFIX)]

    alignment_domains = [
        col[len(_ALIGNMENT_PREFIX):]
        for col in data.columns if col.startswith(_ALIGNMENT_PREFIX)]
    alignment_domains.append("local")

    for domain in alignment_domains:
        # one row of flattened particle values per sample (NxM array)
        values = np.array([
            _sample_features(row, particle_columns, domain)
            for _, row in data.iterrows()])

        # recenter the data
        values -= np.mean(values, axis=0)

        # n_components may not exceed min(n_samples, n_features); this equals
        # the number of samples whenever there are at least as many features.
        pca = PCA(n_components=min(values.shape))
        pca_scores = pca.fit_transform(values)

        # write values to csv
        save_to_csv(pca_scores, output + f"_{domain}.csv")


# Script entry point: hand python_pca to Fire so that its parameters
# (file, output) are supplied from the command line.
if __name__ == "__main__":
    fire.Fire(python_pca)

akenmorris added the Enhancement label Jul 14, 2024

akenmorris self-assigned this Jul 14, 2024

akenmorris added the Python API label Jul 14, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add alignments to Python PCA API #2285

Add alignments to Python PCA API #2285

akenmorris commented Jul 14, 2024 •

edited

Loading

Add alignments to Python PCA API #2285

Add alignments to Python PCA API #2285

Comments

akenmorris commented Jul 14, 2024 • edited Loading

akenmorris commented Jul 14, 2024 •

edited

Loading