df.to_csv(filename) # Writes to a CSV file
df.to_excel(filename) # Writes to an Excel file
df.to_sql(table_name, connection_object) # Writes to a SQL table
df.to_json(filename) # Writes to a file in JSON format
df.to_html(filename) # Saves as an HTML table
df.to_clipboard() # Writes to the clipboard
Defining, then Coding and Testing Immediately
In reality, it is often more practical to define a cleaning operation, then immediately code and test it. The data wrangling template still applies here, except you'll have multiple Define, Code, and Test subheadings, with third-level headers (###) denoting each issue, as displayed below.
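For example, the notebook outline for two issues might look like this (a sketch of the header structure only; issue names are placeholders):

### Issue 1: inconsistent column labels
#### Define
#### Code
#### Test

### Issue 2: missing values
#### Define
#### Code
#### Test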
Saving and writing
Remember, set index=False to avoid saving with an unnamed column!
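For example (a minimal sketch; the filename is arbitrary):

# write without the index, then read back to verify the round trip
df.to_csv('clean_data.csv', index=False)
df_check = pd.read_csv('clean_data.csv')
df_check.head()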
Selecting
df[col] # Returns column with label col as Series
df[[col1, col2]] # Returns Columns as a new DataFrame
s.iloc[0] # Selection by position (selects first element)
s.loc[0] # Selection by label (selects the element whose index label is 0)
df.iloc[0,:] # First row
df.iloc[0,0] # First element of first column
We can select data using loc and iloc, which you can read more about in the pandas indexing documentation. loc uses the labels of rows or columns to select data, while iloc uses integer positions. We'll use these to index the dataframe below.
# select all the columns from 'id' to the last mean column, which is called
# fractal_dimension_mean (there are other columns after that)
df_means = df.loc[:,'id':'fractal_dimension_mean']
df_means.head()
# Using index numbers
df_means = df.iloc[:,:12]
Another useful function that we’re going to use is pandas' query function.
In the previous lesson, we selected rows in a dataframe by indexing with a mask. Here are those same examples, along with equivalent statements that use query().
# selecting malignant records in cancer data
df_m = df[df['diagnosis'] == 'M']
df_m = df.query('diagnosis == "M"')
# selecting records of people making over $50K
df_a = df[df['income'] == ' >50K']
df_a = df.query('income == " >50K"')
The examples above filtered columns containing strings. You can also use query to filter columns containing numeric data like this.
# selecting records in cancer data with radius greater than the median
df_h = df[df['radius'] > 13.375]
df_h = df.query('radius > 13.375')
Groupby
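A minimal groupby sketch: groupby splits the rows into groups, applies an aggregate to each group, and combines the results. This assumes the cancer dataframe from the examples above; the column names are assumptions based on those examples.

# mean of every numeric column, per diagnosis group
df.groupby('diagnosis').mean(numeric_only=True)

# several aggregates of a single column, per group
df.groupby('diagnosis')['radius_mean'].agg(['mean', 'median', 'std'])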
Describing
df.shape # Returns a (rows, columns) tuple (an attribute, not a method)
df.head(n) # Returns the first n rows of the DataFrame
df.tail(n) # Returns the last n rows of the DataFrame
df.info() # Index, Datatype and Memory information
df.describe() # Summary statistics for numerical columns
s.value_counts(dropna=False) # Views unique values and counts
df.apply(pd.Series.value_counts) # Unique values and counts for all columns
df.mean() # Returns the mean of all columns
df.corr() # Returns the correlation between columns in a DataFrame
df.count() # Returns the number of non-null values in each DataFrame column
df.max() # Returns the highest value in each column
df.min() # Returns the lowest value in each column
df.median() # Returns the median of each column
df.std() # Returns the standard deviation of each column
Useful exploratory methods:
df_red.duplicated().sum() # Number of fully duplicated rows
df_red.isnull().sum() # Number of missing values in each column
Cleaning
# Data cleaning
df.columns = ['a','b','c'] # Renames columns
pd.isnull(obj) # Checks for null values, returns a Boolean array
pd.notnull(obj) # Opposite of pd.isnull(obj)
df.dropna() # Drops all rows that contain null values
df.dropna(axis=1) # Drops all columns that contain null values
df.dropna(axis=1,thresh=n) # Drops all columns that have fewer than n non-null values
df.fillna(x) # Replaces all null values with x
s.fillna(s.mean()) # Replaces all null values with the mean (mean can be replaced with almost any function from the statistics section)
s.astype(float) # Converts the datatype of the series to float
s.replace(1,'one') # Replaces all values equal to 1 with 'one'
s.replace([1,3],['one','three']) # Replaces all 1 with 'one' and 3 with 'three'
df.rename(columns=lambda x: x + 1) # Mass renaming of columns (this example assumes numeric labels)
df.rename(columns=lambda x: x[:10] + "_2008", inplace=True) # Truncates each label to 10 characters and appends a suffix
df.rename(columns={'old_name': 'new_name'}, inplace=True) # Selective renaming
df.set_index('column_one') # Changes the index
df.rename(index=lambda x: x + 1) # Mass renaming of index
df['date'] = pd.to_datetime(df['date']) # Converts a column to datetime ('date' is a placeholder column name)
df.drop_duplicates(inplace=True) # Drops duplicate rows in place
# Example: drop columns from 2008 dataset
df_08.drop(['Stnd', 'Underhood ID', 'FE Calc Appr', 'Unadj Cmb MPG'], axis=1, inplace=True)
# Example: replace spaces with underscores and lowercase labels for 2008 dataset
# strip() returns a copy of the string with leading and trailing whitespace removed.
df_08.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True)
# Check that none of these values remain in the column:
for phrase in asap_list:
    assert phrase not in df_clean.StartDate.values
# confirm column labels for 2008 and 2018 datasets are identical
df_08.columns == df_18.columns
# make sure they're all identical like this
(df_08.columns == df_18.columns).all()
# return duplicated rows
df2[df2.duplicated(['user_id'], keep=False)]
Fixing data types
# convert 2018 cyl column to int
df_18['cyl'] = df_18['cyl'].astype(int)
Regular expressions
# Extract int from strings in the 2008 cyl column
df_08['cyl'] = df_08['cyl'].str.extract(r'(\d+)', expand=False).astype(int)
Renaming Columns
The following script helps make column labels consistent across tables, for example when labels in one table carry a suffix (like "_mean") that their counterparts lack.
Helpful script
# remove "_mean" from column names
new_labels = []
for col in df.columns:
if '_mean' in col:
new_labels.append(col[:-5]) # exclude last 6 characters
else:
new_labels.append(col)
# new labels for our columns
new_labels
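To actually apply them, assign the list back to the dataframe's columns:

# assign the cleaned labels back to the dataframe
df.columns = new_labels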
Tidiness
# query the full status for the first nine tweet IDs via tweepy
# (in the full project, each status's JSON is written to tweet_json.txt, read below)
for t_id in tweet_id[0:9]:
    status = api.get_status(t_id, tweet_mode='extended')
import json

rows = []
with open('tweet_json.txt') as f:
    for line in f:
        status = json.loads(line)
        tweet_id = status['id']                    # the status ID (standard Twitter v1.1 JSON key)
        retweet_count = status['retweet_count']    # get retweet count
        favorite_count = status['favorite_count']  # get favorite count
        rows.append([tweet_id, retweet_count, favorite_count])

# build the dataframe once, instead of appending row by row (DataFrame.append is deprecated)
df_3 = pd.DataFrame(rows, columns=['tweet_id', 'retweet_count', 'favorite_count'])