#!/usr/bin/python3
# ==================================================================
# FROM: www.youtube.com/watch?v=u4rsA5ZiTls
# This INCREDIBLE trick will speed up your data processes.
# ==================================================================
import pandas as pd
import numpy as np
# Path of the CSV file that the round-trip sections below write and re-read.
csv_file = 'dataset.csv'
def get_dataset(size):
    '''Build a DataFrame of `size` rows filled with random sample data.

    Columns, in order:
        size  one of 'big', 'medium', 'small'
        age   integer in the range 1..49
        team  one of 'red', 'blue', 'yellow', 'green'
        win   the string 'yes' or 'no'
        date  a day between 2020-01-01 and 2022-12-31
        prob  float drawn uniformly from [0, 1)

    All values are drawn from NumPy's global random state, so two calls
    give different data unless the caller seeds it.
    '''
    calendar = pd.date_range('2020-01-01', '2022-12-31')
    pick = np.random.choice
    # Dict entries are evaluated top to bottom, so the random draws are
    # made in column order.
    columns = {
        'size': pick(['big', 'medium', 'small'], size),
        'age': np.random.randint(1, 50, size),
        'team': pick(['red', 'blue', 'yellow', 'green'], size),
        'win': pick(['yes', 'no'], size),
        'date': pick(calendar, size),
        'prob': np.random.uniform(0, 1, size),
    }
    return pd.DataFrame(columns)
def set_dtypes(df):
    '''Convert the dataset columns to compact dtypes, in place.

    Conversions applied:
        size, team  -> category
        age         -> int16
        win         -> bool ('yes' becomes True, 'no' becomes False)
        prob        -> float16

    Args:
        df: DataFrame holding the columns 'size', 'age', 'team', 'win'
            and 'prob'.

    Returns:
        The same DataFrame object that was passed in, after modification.

    Raises:
        ValueError: if 'win' is not already boolean and contains a value
            other than 'yes' or 'no'. The check runs before any column is
            changed, so a failing call leaves `df` untouched.
    '''
    win = df['win']
    if not pd.api.types.is_bool_dtype(win):
        # astype('bool') on strings only tests for non-emptiness, so both
        # 'yes' and 'no' would turn into True; translate the labels instead.
        mapped = win.map({'yes': True, 'no': False})
        unknown = mapped.isna()
        if unknown.any():
            bad = sorted(set(win[unknown].astype(str)))
            raise ValueError(f"'win' must be 'yes' or 'no', got: {bad}")
        win = mapped

    df['size'] = df['size'].astype('category')
    df['age'] = df['age'].astype('int16')
    df['team'] = df['team'].astype('category')
    df['win'] = win.astype('bool')
    # Half precision: values are rounded to what float16 can represent.
    df['prob'] = df['prob'].astype('float16')
    return df
# ------------------------------------------------------------------
# ---- create dataset (without set dtypes)
# ------------------------------------------------------------------
# Generate a dataset with pandas' default dtypes, round-trip it through
# CSV, and print info()/head() before and after so the two can be compared.
# Every banner is padded to the same 50-character width as the '=' rules.
print()
print('='*50)
print('create dataset (without set dtypes) '.ljust(50, '-'))
print('='*50)
print()
df = get_dataset(10_000)
print('create ds info '.ljust(50, '-'))
df.info()
print('create ds head '.ljust(50, '-'))
print(df.head())
# ---- CSV
print()
print('write csv '.ljust(50, '-'))
print()
# index=False: the default integer index carries no information, and
# writing it would make read_csv() return an extra 'Unnamed: 0' column.
df.to_csv(csv_file, index=False)
## look at size of file
## command line: ls -GFlash dataset.csv
print('read csv '.ljust(50, '-'))
print()
df = pd.read_csv(csv_file)
print('read csv info '.ljust(50, '-'))
df.info()
print('read csv head '.ljust(50, '-'))
print(df.head())
# ------------------------------------------------------------------
# ---- create dataset (with set_dtypes)
# ------------------------------------------------------------------
# Same round trip as the previous section, but the frame is passed through
# set_dtypes() before it is written. The frame is then re-read from disk so
# the second info()/head() shows what read_csv() infers from the file,
# not the in-memory frame.
# Every banner is padded to the same 50-character width as the '=' rules.
print()
print('='*50)
print('create dataset (with set dtypes) '.ljust(50, '-'))
print('='*50)
print()
df = get_dataset(10_000)
df = set_dtypes(df)
print('create ds info '.ljust(50, '-'))
df.info()
print('create ds head '.ljust(50, '-'))
print(df.head())
# ---- CSV
print()
print('write csv '.ljust(50, '-'))
print()
# index=False: the default integer index carries no information, and
# writing it would make read_csv() return an extra 'Unnamed: 0' column.
df.to_csv(csv_file, index=False)
## look at size of file
## command line: ls -GFlash dataset.csv
print('read csv '.ljust(50, '-'))
print()
# Actually load the file back; without this the output below would just
# repeat the frame that was written.
df = pd.read_csv(csv_file)
print('read csv info '.ljust(50, '-'))
df.info()
print('read csv head '.ljust(50, '-'))
print(df.head())