Skip to content

Class Documentation

Here you can find detailed documentation for the NodeCollectorProcessor class in both Python and the command line:

NodesCollectorProcessor Module

This module provides the NodesCollectorProcessor class for collecting and processing data for different types of nodes using the NodePropertiesExtractor and NodeDataProcessor classes. The collected data is intended for loading into a Neo4j graph database. The module supports command-line interface (CLI) usage for ease of use.

Classes:

Name Description
NodesCollectorProcessor

A class to collect and process data for different types of nodes.

Functions:

Name Description
main

Main function to parse command-line arguments and collect data for the specified node type and enzyme list.

NodesCollectorProcessor

A class to collect and process data for different types of nodes using NodePropertiesExtractor and NodeDataProcessor.

Source code in chemgraphbuilder/node_collector_processor.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
class NodesCollectorProcessor:
    """
    A class to collect and process data for different types of nodes using NodePropertiesExtractor and NodeDataProcessor.
    """

    def __init__(self, node_type, enzyme_list, start_chunk=None):
        """
        Initializes the NodesCollectorProcessor with the node type to collect data for, and the list of enzymes.

        Args:
            node_type (str): The type of node to collect data for (e.g., 'Compound', 'BioAssay', 'Gene', 'Protein').
            enzyme_list (list of str): List of enzyme names for which assay data will be fetched from PubChem.
            start_chunk (int, optional): The starting chunk index for processing. Default is None.
        """
        self.node_type = node_type
        self.extractor = NodePropertiesExtractor(enzyme_list=enzyme_list)
        self.processor = NodeDataProcessor(data_dir="Data")
        self.start_chunk = start_chunk

    def collect_and_process_data(self):
        """
        Collects and processes data based on the node type and saves it to the appropriate file.
        """
        data_file = 'Data/AllDataConnected.csv'

        # Check if the data file exists before running the extractor
        if not os.path.exists(data_file):
            logging.info(f"{data_file} does not exist. Running data extraction...")
            df = self.extractor.run()
        else:
            logging.info(f"{data_file} already exists. Skipping main data extraction.")

        if self.node_type == 'Compound':
            self.extractor.extract_compound_properties(main_data='Data/AllDataConnected.csv', start_chunk=self.start_chunk)
            self.processor.preprocess_compounds()
        elif self.node_type == 'BioAssay':
            self.extractor.extract_assay_properties(main_data='Data/AllDataConnected.csv')
            self.processor.preprocess_assays()
        elif self.node_type == 'Gene':
            self.extractor.extract_gene_properties(main_data='Data/AllDataConnected.csv')
            self.processor.preprocess_genes()
        elif self.node_type == 'Protein':
            self.extractor.extract_protein_properties(main_data='Data/AllDataConnected.csv')
            self.processor.preprocess_proteins()
        else:
            logging.error(f"Unsupported node type: {self.node_type}")

__init__(node_type, enzyme_list, start_chunk=None)

Initializes the NodesCollectorProcessor with the node type to collect data for, and the list of enzymes.

Parameters:

Name Type Description Default
node_type str

The type of node to collect data for (e.g., 'Compound', 'BioAssay', 'Gene', 'Protein').

required
enzyme_list list of str

List of enzyme names for which assay data will be fetched from PubChem.

required
start_chunk int

The starting chunk index for processing. Default is None.

None
Source code in chemgraphbuilder/node_collector_processor.py
29
30
31
32
33
34
35
36
37
38
39
40
41
def __init__(self, node_type, enzyme_list, start_chunk=None):
    """
    Initializes the NodesCollectorProcessor with the node type to collect data for, and the list of enzymes.

    Args:
        node_type (str): The type of node to collect data for (e.g., 'Compound', 'BioAssay', 'Gene', 'Protein').
        enzyme_list (list of str): List of enzyme names for which assay data will be fetched from PubChem.
        start_chunk (int, optional): The starting chunk index for processing. Default is None.
    """
    self.node_type = node_type
    self.extractor = NodePropertiesExtractor(enzyme_list=enzyme_list)
    self.processor = NodeDataProcessor(data_dir="Data")
    self.start_chunk = start_chunk

collect_and_process_data()

Collects and processes data based on the node type and saves it to the appropriate file.

Source code in chemgraphbuilder/node_collector_processor.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def collect_and_process_data(self):
    """
    Collects and processes data based on the node type and saves it to the appropriate file.
    """
    data_file = 'Data/AllDataConnected.csv'

    # Check if the data file exists before running the extractor
    if not os.path.exists(data_file):
        logging.info(f"{data_file} does not exist. Running data extraction...")
        df = self.extractor.run()
    else:
        logging.info(f"{data_file} already exists. Skipping main data extraction.")

    if self.node_type == 'Compound':
        self.extractor.extract_compound_properties(main_data='Data/AllDataConnected.csv', start_chunk=self.start_chunk)
        self.processor.preprocess_compounds()
    elif self.node_type == 'BioAssay':
        self.extractor.extract_assay_properties(main_data='Data/AllDataConnected.csv')
        self.processor.preprocess_assays()
    elif self.node_type == 'Gene':
        self.extractor.extract_gene_properties(main_data='Data/AllDataConnected.csv')
        self.processor.preprocess_genes()
    elif self.node_type == 'Protein':
        self.extractor.extract_protein_properties(main_data='Data/AllDataConnected.csv')
        self.processor.preprocess_proteins()
    else:
        logging.error(f"Unsupported node type: {self.node_type}")

main()

Main function to parse command-line arguments and collect data for the specified node type and enzyme list.

Source code in chemgraphbuilder/node_collector_processor.py
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def main():
    """
    Main function to parse command-line arguments and collect data for the specified node type and enzyme list.
    """
    parser = argparse.ArgumentParser(description="Collect data for different types of nodes.")
    parser.add_argument('--node_type', type=str, required=True, choices=['Compound', 'BioAssay', 'Gene', 'Protein'], help='The type of node to collect data for')
    parser.add_argument('--enzyme_list', type=str, required=True, help='Comma-separated list of enzyme names to fetch data for')
    parser.add_argument('--start_chunk', type=int, default=None, help='The starting chunk index for processing Compound Data')

    args = parser.parse_args()
    enzyme_list = args.enzyme_list.split(',')

    collector = NodesCollectorProcessor(node_type=args.node_type, enzyme_list=enzyme_list, start_chunk=args.start_chunk)
    collector.collect_and_process_data()