Sort Numeric Columns in Excel using Formulas
=RANK(C5,sales)+COUNTIF($C$5:C5,C5)-1
where “sales” is the named range C5:C11
References
https://exceljet.net/formula/basic-numeric-sort-formula
=RANK(C5,sales)+COUNTIF($C$5:C5,C5)-1
where “sales” is the named range C5:C11
References
https://exceljet.net/formula/basic-numeric-sort-formula
len(gapminder['country'].unique().tolist())
set(df['region'].values.tolist())
# Create a list of unique values by turning the # pandas column into a set list(set(df.trucks))
# Create a list of unique values in df.trucks list(df['trucks'].unique())
# Import pandas package import pandas as pd # create a dictionary with five fields each data = { 'A':['A1', 'A2', 'A3', 'A4', 'A5'], 'B':['B1', 'B2', 'B3', 'B4', 'B4'], 'C':['C1', 'C2', 'C3', 'C3', 'C3'], 'D':['D1', 'D2', 'D2', 'D2', 'D2'], 'E':['E1', 'E1', 'E1', 'E1', 'E1'] } # Convert the dictionary into DataFrame df = pd.DataFrame(data) # Get the unique values of 'B' column df.B.unique()
# Import pandas package import pandas as pd # create a dictionary with five fields each data = { 'A':['A1', 'A2', 'A3', 'A4', 'A5'], 'B':['B1', 'B2', 'B3', 'B4', 'B4'], 'C':['C1', 'C2', 'C3', 'C3', 'C3'], 'D':['D1', 'D2', 'D2', 'D2', 'D2'], 'E':['E1', 'E1', 'E1', 'E1', 'E1'] } # Convert the dictionary into DataFrame df = pd.DataFrame(data) # Get number of unique values in column 'C' df.C.nunique(dropna = True)
References
https://pythonprogramming.net/graph-visualization-python3-pandas-data-analysis/
https://www.geeksforgeeks.org/get-unique-values-from-a-column-in-pandas-dataframe/
https://chrisalbon.com/python/data_wrangling/pandas_find_unique_values/
https://cmdlinetips.com/2018/01/how-to-get-unique-values-from-a-column-in-pandas-data-frame/
DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
Sort Dataframe rows based on a single column
# Sort the rows of dataframe by column 'Name' dfObj = dfObj.sort_values(by ='Name' ) print("Contents of Sorted Dataframe based on a single column 'Name' : ") print(dfObj)
Sort Dataframe rows based on multiple columns
dfObj = dfObj.sort_values(by =['Name', 'Marks']) print("Contents of a Sorted Dataframe based on multiple columns 'Name' & 'Marks' : ") print(dfObj)
Sort Dataframe rows based on columns in Descending Order
# Sort the rows of dataframe by column 'Name' in descending order dfObj = dfObj.sort_values(by ='Name' , ascending=False) print("Contents of Sorted Dataframe based on a column 'Name' in Descending Order : ") print(dfObj)
Sort Dataframe rows based on a column in Place
# Sort the rows of dataframe by column 'Name' inplace dfObj.sort_values(by='Name' , inplace=True) print("Contents of Sorted Dataframe based on a single column 'Name' inplace: ") print(dfObj)
Sort columns of a Dataframe based on a single row
dfObj = dfObj.sort_values(by ='b', axis=1) print("Contents of Sorted Dataframe based on a single row index label 'b' ") print(dfObj)
Sort columns of a Dataframe in Descending Order based on a single row
# Sort the columns of the dataframe in descending order of the values in the row labelled 'b'
dfObj = dfObj.sort_values(by='b', axis=1, ascending=False)
print("Contents of Sorted Dataframe in descending order based on a single row index label 'b' ")
print(dfObj)
Sort columns of a Dataframe based on multiple rows
# Sort the columns of the dataframe by the values in the rows labelled 'b' and 'c'
dfObj = dfObj.sort_values(by=['b', 'c'], axis=1)
print("Contents of Sorted Dataframe based on multiple rows index label 'b' & 'c' ")
print(dfObj)
Complete example is as follows:
import pandas as pd


def main():
    """Demonstrate DataFrame.sort_values on rows and on columns, printing each result."""

    def report(caption, frame):
        # Each step prints a caption line followed by the frame itself.
        print(caption)
        print(frame)

    # --- Row sorting -------------------------------------------------------
    student_records = [
        ('Jack', 34, 'Sydney'),
        ('Riti', 41, 'Delhi'),
        ('Aadi', 16, 'New York'),
        ('Riti', 22, 'Delhi'),
        ('Riti', 35, 'Delhi'),
        ('Riti', 40, 'Mumbai'),
    ]
    students_df = pd.DataFrame(
        students_records if False else student_records,
        columns=['Name', 'Marks', 'City'],
        index=['b', 'a', 'f', 'e', 'd', 'c'],
    ) if False else pd.DataFrame(
        student_records,
        columns=['Name', 'Marks', 'City'],
        index=['b', 'a', 'f', 'e', 'd', 'c'],
    )
    report("Original Dataframe : ", students_df)

    print('**** Sort Dataframe rows based on a single column ****')
    students_df = students_df.sort_values(by='Name')
    report("Contents of Sorted Dataframe based on a single column 'Name' : ", students_df)

    print('**** Sort Dataframe rows based on a multiple columns ****')
    students_df = students_df.sort_values(by=['Name', 'Marks'])
    report("Contents of a Sorted Dataframe based on multiple columns 'Name' & 'Marks' : ", students_df)

    print('**** Sort Dataframe rows based on a single column in Descending Order ****')
    students_df = students_df.sort_values(by='Name', ascending=False)
    report("Contents of Sorted Dataframe based on a column 'Name' in Descending Order : ", students_df)

    print('**** Sort Dataframe rows based on a single column in place ****')
    # This step mutates the frame instead of rebinding the name.
    students_df.sort_values(by='Name', inplace=True)
    report("Contents of Sorted Dataframe based on a single column 'Name' inplace: ", students_df)

    # --- Column sorting ----------------------------------------------------
    print('******** Sort columns of Dataframe based on a single or multiple rows ********')
    grid_rows = [
        (222, 16, 23),
        (333, 31, 11),
        (444, 34, 11),
    ]
    grid_df = pd.DataFrame(grid_rows, index=['a', 'b', 'c'])
    report("Original Dataframe: ", grid_df)

    # Order the columns by the values found in the row labelled 'b'.
    grid_df = grid_df.sort_values(by='b', axis=1)
    report("Contents of Sorted Dataframe based on a single row index label 'b' ", grid_df)

    # Same key row, largest value first.
    grid_df = grid_df.sort_values(by='b', axis=1, ascending=False)
    report("Contents of Sorted Dataframe in descending order based on a single row index label 'b' ", grid_df)

    # Use rows 'b' and 'c' together as the sort keys.
    grid_df = grid_df.sort_values(by=['b', 'c'], axis=1)
    report("Contents of Sorted Dataframe based on multiple rows index label 'b' & 'c' ", grid_df)


if __name__ == '__main__':
    main()
References
https://thispointer.com/pandas-sort-rows-or-columns-in-dataframe-based-on-values-using-dataframe-sort_values/
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html
=VALUE("$1,000")
=VALUE(A2)
=VALUE(RIGHT(A2,3))
References
https://www.ablebits.com/office-addins-blog/2018/07/18/excel-convert-text-to-number/
https://exceljet.net/excel-functions/excel-value-function
def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None,
                       truncate_sheet=False, **to_excel_kwargs):
    """Append a DataFrame to a sheet of an Excel workbook.

    If ``filename`` does not exist yet, the workbook is created.

    Relies on ``pd.ExcelWriter(mode='a', if_sheet_exists=...)``; the
    ``'overlay'`` option is documented as added in pandas 1.4. The openpyxl
    package must be installed.

    Parameters:
      filename : path of the Excel workbook (example: '/path/to/file.xlsx')
      df : DataFrame to write to the workbook
      sheet_name : name of the sheet that receives the DataFrame
                   (default: 'Sheet1')
      startrow : zero-based row at which the upper-left cell of the frame is
                 written. With the default (None) the frame is written below
                 the last used row of the existing sheet, or at row 0 when
                 the sheet does not exist yet.
      truncate_sheet : remove and recreate [sheet_name] before writing.
                       NOTE: as in the original implementation, a default
                       ``startrow`` is still taken from the sheet as it was
                       before truncation; pass ``startrow=0`` to write at the
                       top of the recreated sheet.
      to_excel_kwargs : extra keyword arguments forwarded to
                        `DataFrame.to_excel()`; an 'engine' entry is ignored.

    Returns: None
    """
    import pandas as pd

    # The engine is fixed to openpyxl, so a caller-supplied one is dropped.
    to_excel_kwargs.pop('engine', None)

    try:
        # Append mode loads the existing workbook into writer.book.
        # 'replace' drops and recreates the sheet; 'overlay' writes into the
        # existing sheet and leaves its other cells untouched.
        writer = pd.ExcelWriter(
            filename,
            engine='openpyxl',
            mode='a',
            if_sheet_exists='replace' if truncate_sheet else 'overlay',
        )
    except FileNotFoundError:
        # File does not exist yet: fall back to creating a new workbook.
        writer = pd.ExcelWriter(filename, engine='openpyxl')

    # The context manager saves and closes the workbook, even on error.
    with writer:
        if startrow is None:
            if sheet_name in writer.book.sheetnames:
                # Continue right below the last used row of the sheet.
                startrow = writer.book[sheet_name].max_row
            else:
                startrow = 0

        df.to_excel(writer, sheet_name=sheet_name, startrow=startrow,
                    **to_excel_kwargs)
Selecting pandas data using “iloc”
The iloc indexer for Pandas Dataframe is used for integer-location based indexing / selection by position.
# Single selections using iloc and DataFrame # Rows: data.iloc[0] # first row of data frame (Aleshia Tomkiewicz) - Note a Series data type output. data.iloc[1] # second row of data frame (Evan Zigomalas) data.iloc[-1] # last row of data frame (Mi Richan) # Columns: data.iloc[:,0] # first column of data frame (first_name) data.iloc[:,1] # second column of data frame (last_name) data.iloc[:,-1] # last column of data frame (id)
# Multiple row and column selections using iloc and DataFrame data.iloc[0:5] # first five rows of dataframe data.iloc[:, 0:2] # first two columns of data frame with all rows data.iloc[[0,3,6,24], [0,5,6]] # 1st, 4th, 7th, 25th row + 1st 6th 7th columns. data.iloc[0:5, 5:8] # first 5 rows and 6th, 7th, 8th columns of data frame (county -> phone1).
Selecting pandas data using “loc”
The Pandas loc indexer can be used with DataFrames for two different use cases:
a.) Selecting rows by label/index
b.) Selecting rows with a boolean / conditional lookup
# Select rows with index values 'Andrade' and 'Veness', with all columns between 'city' and 'email' data.loc[['Andrade', 'Veness'], 'city':'email'] # Select every row from 'Andrade' through 'Veness' (a label slice includes both ends), with just 'first_name', 'address' and 'city' columns data.loc['Andrade':'Veness', ['first_name', 'address', 'city']] # Change the index to be based on the 'id' column data.set_index('id', inplace=True) # select the row with 'id' = 487 data.loc[487]
# Select rows with first name Antonio,
# and all columns between 'city' and 'email'
data.loc[data['first_name'] == 'Antonio', 'city':'email']

# Select rows where the email column ends with 'hotmail.com', include all columns
data.loc[data['email'].str.endswith("hotmail.com")]

# Select rows with first_name equal to some values, all columns
data.loc[data['first_name'].isin(['France', 'Tyisha', 'Eric'])]

# Select rows with first name Antonio AND gmail email addresses
data.loc[data['email'].str.endswith("gmail.com") & (data['first_name'] == 'Antonio')]

# select rows with id column between 100 and 200, and just return 'postal' and 'web' columns
data.loc[(data['id'] > 100) & (data['id'] <= 200), ['postal', 'web']]

# A lambda function that yields True/False values can also be used.
# Select rows where the company name has 4 words in it.
data.loc[data['company_name'].apply(lambda x: len(x.split(' ')) == 4)]

# Selections can be achieved outside of the main .loc for clarity:
# Form a separate variable with your selections:
idx = data['company_name'].apply(lambda x: len(x.split(' ')) == 4)

# Select only the True values in 'idx' and only the 3 columns specified
# (the company column is named 'company_name', as in the filter above):
data.loc[idx, ['email', 'first_name', 'company_name']]
References
https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
https://thispointer.com/select-rows-columns-by-name-or-index-in-dataframe-using-loc-iloc-python-pandas/
The Excel ROW function returns the row number for a reference. For example, ROW(C5) returns 5, since C5 is in the fifth row of the spreadsheet. When no reference is provided, ROW returns the row number of the cell which contains the formula.
=ROW(A3) // returns 3 =ROW(D5) // returns 5 =ROW(F10:J15) // returns 10
References
https://exceljet.net/excel-functions/excel-row-function
sudo nano /etc/apt/sources.list
replace
deb http://us.archive.ubuntu.com/ubuntu/ xenial main restricted
with
deb mirror://mirrors.ubuntu.com/mirrors.txt xenial main restricted
Manual apt mirror selection
wget -qO - mirrors.ubuntu.com/mirrors.txt
References
https://linuxconfig.org/how-to-select-the-fastest-apt-mirror-on-ubuntu-linux
Method #1: Using DataFrame.astype()
# importing pandas as pd import pandas as pd # sample dataframe df = pd.DataFrame({ 'A': [1, 2, 3, 4, 5], 'B': ['a', 'b', 'c', 'd', 'e'], 'C': [1.1, '1.0', '1.3', 2, 5] }) # converting all columns to string type df = df.astype(str) print(df.dtypes)
# importing pandas as pd import pandas as pd # sample dataframe df = pd.DataFrame({ 'A': [1, 2, 3, 4, 5], 'B': ['a', 'b', 'c', 'd', 'e'], 'C': [1.1, '1.0', '1.3', 2, 5] }) # using dictionary to convert specific columns convert_dict = {'A': int, 'C': float } df = df.astype(convert_dict) print(df.dtypes)
Method #2: Using DataFrame.apply()
We can pass pandas.to_numeric, pandas.to_datetime and pandas.to_timedelta as an argument to the apply() function to change the datatype of one or more columns to numeric, datetime and timedelta respectively.
# importing pandas as pd import pandas as pd # sample dataframe df = pd.DataFrame({ 'A': [1, 2, 3, '4', '5'], 'B': ['a', 'b', 'c', 'd', 'e'], 'C': [1.1, '2.1', 3.0, '4.1', '5.1'] }) # using apply method df[['A', 'C']] = df[['A', 'C']].apply(pd.to_numeric) print(df.dtypes)
Method #3: Using DataFrame.infer_objects()
# importing pandas as pd import pandas as pd # sample dataframe df = pd.DataFrame({ 'A': [1, 2, 3, 4, 5], 'B': ['a', 'b', 'c', 'd', 'e'], 'C': [1.1, 2.1, 3.0, 4.1, 5.1] }, dtype ='object') # converting datatypes df = df.infer_objects() print(df.dtypes)
References
https://www.geeksforgeeks.org/change-data-type-for-one-or-more-columns-in-pandas-dataframe/
https://towardsdatascience.com/my-pandas-cheat-sheet-b71437ab26f
import pandas as pd df: pd.DataFrame = pd.read_csv("avocado.csv") df_cp = df.copy()
References
https://pythonprogramming.net/introduction-python3-pandas-data-analysis/