
Clean Data Module

Clean Data.

Script to clean data from a dataset and create a new Excel sheet with the cleaned data.

It can be run from the command line:

    python3 clean_data.py {input_file} {output_file}

Or import the clean_data function:

    from clean_data import clean_data

This module contains the following functions:

- create_data_frame
- clean_data
- remove_duplicates
- main
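
A minimal usage sketch, assuming the module is importable and an input workbook exists in the working directory (file names here are hypothetical):

    from clean_data import create_data_frame, clean_data

    data_frame = create_data_frame("input.xlsx")   # load the raw Excel sheet
    clean_data_frame = clean_data(data_frame)      # strip 'ND', whitespace, quotes; coerce numerics
    clean_data_frame.to_excel("output.xlsx", index=False)

Note that clean_data also calls remove_duplicates, which expects a configured duplicates directory (see remove_duplicates below).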

clean_data(data_frame)

Clean data.

Takes a data_frame object and cleans the data by using 'applymap' to apply a lambda function to all DataFrame values.

The first lambda function converts values to strings, replaces instances of 'ND' with an empty string, and strips whitespace and quotation marks.

The second lambda function converts values that should be numeric back to numeric values.

Parameters:

| Name       | Type             | Description             | Default  |
|------------|------------------|-------------------------|----------|
| data_frame | pandas.DataFrame | Pandas DataFrame object | required |

Returns:

| Type             | Description                                            |
|------------------|--------------------------------------------------------|
| pandas.DataFrame | clean_data_frame, the cleaned Pandas DataFrame object  |

Source code in report_generator/excel_extraction/clean_data.py
def clean_data(data_frame: pandas.DataFrame) -> pandas.DataFrame:
    """## Clean data.

    Takes data_frame object and cleans data by using 'applymap' to apply a lambda
    function to all DataFrame values.

    1st Lambda function converts values to strings, replaces instances of ND with
    empty string, and strips whitespace and quotation marks.

    2nd Lambda function converts values that should be numeric back to numeric
    values.

    Args:
        data_frame (pandas.DataFrame): Pandas DataFrame object

    Returns:
        clean_data_frame (pandas.DataFrame): Cleaned Pandas DataFrame object

    """
    clean_data_frame = data_frame.applymap(
        lambda x: re.sub("ND", "", str(x)).strip().strip('"'), na_action="ignore"
    )
    clean_data_frame = clean_data_frame.applymap(
        lambda x: pandas.to_numeric(x, errors="coerce")
        if str(x).replace(".", "").isdigit()
        else x
    )
    clean_data_frame = clean_data_frame.applymap(lambda x: None if x == "" else x)
    clean_data_frame = remove_duplicates(clean_data_frame)

    return clean_data_frame
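
For illustration, the two applymap passes behave roughly like this on a single column (a standalone sketch with hypothetical values, not part of the module):

    import re
    import pandas

    col = pandas.Series([' "12.5" ', "ND", "  Falco  "])
    # First pass: stringify, drop "ND", strip whitespace and surrounding quotes
    step1 = col.map(lambda x: re.sub("ND", "", str(x)).strip().strip('"'))
    # Second pass: coerce purely numeric strings back to numbers
    step2 = step1.map(
        lambda x: pandas.to_numeric(x, errors="coerce")
        if str(x).replace(".", "").isdigit()
        else x
    )
    # step2 now holds [12.5, "", "Falco"]; clean_data then maps "" to None
    # and removes duplicate taxonomy rows via remove_duplicates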

create_data_frame(path_to_dataset)

Create data frame.

Loads an Excel file and returns a Pandas DataFrame object.

Parameters:

| Name            | Type | Description      | Default  |
|-----------------|------|------------------|----------|
| path_to_dataset | str  | File path string | required |

Returns:

| Name       | Type             | Description                                  |
|------------|------------------|----------------------------------------------|
| data_frame | pandas.DataFrame | Pandas DataFrame loaded from the Excel file  |

Source code in report_generator/excel_extraction/clean_data.py
def create_data_frame(path_to_dataset: str) -> pandas.DataFrame:
    """## Create data frame.

    Loads an Excel file and returns a Pandas DataFrame object.

    Args:
        path_to_dataset (str): file path string

    Returns:
        data_frame (pandas.DataFrame): Pandas DataFrame loaded from the Excel file

    """
    logger.info("Clean Data Started")

    data_frame = None
    try:
        data_frame = pandas.read_excel(path_to_dataset)

    except FileNotFoundError as e:
        logger.error(f"Failed to open excel file: {e}")

    logger.info("Clean Data Ended")

    return data_frame
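
On a missing file the function logs the error and returns None rather than raising, so callers may want to guard against that (the file name below is hypothetical):

    data_frame = create_data_frame("dataset.xlsx")
    if data_frame is None:
        raise SystemExit("Could not load dataset.xlsx; see the log for details")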

main(input_file_name, output_file_name)

Clean data main.

Takes input_file_name and output_file_name, loads the data from the input file, cleans it, and saves it as the output file.

Parameters:

| Name             | Type | Description                  | Default  |
|------------------|------|------------------------------|----------|
| input_file_name  | str  | The input file name string   | required |
| output_file_name | str  | The output file name string  | required |

Source code in report_generator/excel_extraction/clean_data.py
def main(input_file_name: str, output_file_name: str) -> None:
    """## Clean data main.

    Takes input_file_name and output_file_name.
    Loads data from input_file
    Cleans data
    Saves as output_file

    Args:
        input_file_name (str): The input file name string
        output_file_name (str): The output file name string

    """
    # Get filepaths
    cur_dir = os.getcwd()
    path_to_dataset = os.path.join(cur_dir, input_file_name)
    path_to_output_file = os.path.join(cur_dir, output_file_name)

    # load df
    try:
        data_frame = create_data_frame(path_to_dataset)

        # clean df
        clean_data_frame = clean_data(data_frame)

        # output to new file
        clean_data_frame.to_excel(path_to_output_file, index=False)

    except FileExistsError as e:
        print(e)
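
Calling main from Python is equivalent to the command-line invocation shown at the top of this page (file names are hypothetical):

    from clean_data import main

    main("raw_dataset.xlsx", "cleaned_dataset.xlsx")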

remove_duplicates(data_frame)

Remove duplicates from data.

Takes a data_frame object, looks for duplicate name-combination entries (Order, Family, Genus, Species), removes the duplicate rows from the dataset (keeping the first occurrence), and writes the duplicated entries to a separate duplicates file.

Parameters:

| Name       | Type             | Description             | Default  |
|------------|------------------|-------------------------|----------|
| data_frame | pandas.DataFrame | Pandas DataFrame object | required |

Returns:

| Name       | Type             | Description                            |
|------------|------------------|----------------------------------------|
| data_frame | pandas.DataFrame | DataFrame with duplicate rows removed  |

Source code in report_generator/excel_extraction/clean_data.py
def remove_duplicates(data_frame: pandas.DataFrame) -> pandas.DataFrame:
    """## Remove duplicates from data.

    Takes a data_frame object, looks for duplicate name-combination entries,
    removes the duplicate rows from the dataset, and writes them to a separate file.

    Args:
        data_frame (pandas.DataFrame): Pandas Dataframe object

    Returns:
        data_frame (pandas.DataFrame): DataFrame with duplicate rows removed
    """
    logger.info(f"Searching Duplicates: Current Row Count={len(data_frame.index)}")
    dups = data_frame[
        data_frame.duplicated(["Order", "Family", "Genus", "Species"], keep=False)
    ]

    settings = report_generator.config.load_config()
    dir_path = settings["dir_path"]
    duplicate_path = os.path.join(dir_path, "data", "duplicates", "duplicates.xlsx")
    logger.info(f"Saving duplicates to duplicates file: {duplicate_path}")
    dups.to_excel(duplicate_path)

    logger.debug("Removing Duplicates")

    data_frame = data_frame.drop_duplicates(
        subset=["Order", "Family", "Genus", "Species"], keep="first"
    )
    logger.info(f"Removed Duplicates: Current Row Count={len(data_frame.index)}")
    return data_frame
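
The duplicate detection itself is plain pandas; this standalone sketch (hypothetical rows, no duplicates file written) shows how the same duplicated and drop_duplicates calls behave:

    import pandas

    df = pandas.DataFrame({
        "Order": ["Carnivora", "Carnivora"],
        "Family": ["Felidae", "Felidae"],
        "Genus": ["Felis", "Felis"],
        "Species": ["catus", "catus"],
        "Mass": [4.0, 4.2],
    })

    # keep=False flags every row in a duplicate group (both rows here)
    dups = df[df.duplicated(["Order", "Family", "Genus", "Species"], keep=False)]

    # keep="first" retains the first occurrence and drops the rest
    deduped = df.drop_duplicates(subset=["Order", "Family", "Genus", "Species"], keep="first")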