Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lr svm k means #2

Merged
merged 5 commits into from
Aug 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ __pycache__

# ADD ANY OTHER DIRECTORIES/FILES HERE
*Report*
*imputed*
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Features that didn't seem as necessary included the device's 'launch status', co

#### Performance of sklearn's algorithms

As shown in the results of the Random Forest classifier by sklearn, we have a very high accuracy of 71%. The relationship seems to be either quite complex or perhaps something was missed. The question of whether the data is in the right form is also raised. The much poorer performance of the Multiple Layer Perceptron at 55% accuracy is also concerning.
As shown in the results of the Random Forest classifier by sklearn, we have an accuracy of 80%. The relationship seems to be either quite complex or perhaps something was missed. The question of whether the data is in the right form is also raised. The much poorer performance of the Multi-Layer Perceptron at 10% accuracy is also concerning.

The question now is: What really is the relationship between a mobile device's technical specifications and its price? Is it linear, exponential or perhaps much more complex?

Expand Down
4 changes: 2 additions & 2 deletions ml_algorithms/LR.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -223,9 +223,9 @@
"\n",
"y = df[\"misc_price\"]\n",
"X = df.drop([\"key_index\", \"misc_price\"], axis=1)\n",
"X = X.drop(['launch_status', 'comms_wlan', 'comms_usb', 'platform_os', 'core_count'], axis=1)\n",
"# X = X.drop(['launch_status', 'comms_wlan', 'comms_usb', 'platform_os', 'core_count'], axis=1)\n",
"\n",
"df_lr = df.drop(['key_index','launch_status', 'comms_wlan', 'comms_usb', 'platform_os', 'core_count'], axis=1)\n",
"# df_lr = df.drop(['key_index','launch_status', 'comms_wlan', 'comms_usb', 'platform_os', 'core_count'], axis=1)\n",
"\n",
"# Train & test split. Seed = 120 for reproducing same shuffling of indices.\n",
"# Note 70-30 split for the preliminary split.\n",
Expand Down
173 changes: 145 additions & 28 deletions ml_algorithms/auxiliary/data_clean2.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@

from .data_interpolate import *

# TODO: change this such that cameras are '0' if we cannot regex them.
# TODO: Other features like 'body_weight' and 'body_sim' could be included.


def extract_straight(df):
for feature in straight_features:
s = df[feature]
Expand Down Expand Up @@ -85,25 +87,38 @@ def sensor(string):
sensors = ['accelerometer', 'proximity',
'compass', 'gyro', 'fingerprint', 'barometer']

return len([ sensor for sensor in sensors if (sensor in string.lower()) ])
return len([sensor for sensor in sensors if (sensor in string.lower())])


# '4K' / '2K' markers are checked before any 'NNNNp' marker.
# Returns the value '1080', '2160', ... if found, with no trailing 'p'.
def cam_vid(string):
    """Extract a vertical video resolution from a camera-video description.

    Args:
        string: Raw cell value; a text description, or None / '' / float
            (NaN) when the value is missing.

    Returns:
        '0' when the value is missing, the resolution digits as a string
        ('2160' for a 4K marker, '1080' for a 2K marker, otherwise the
        digits preceding the first 'p'), or None when no resolution can be
        recognised in the text.
    """
    # The missing-value guard must run before any str() coercion, otherwise
    # NaN would become the truthy text 'nan' and slip past it.
    if not string or isinstance(string, float):
        return str(0)

    text = str(string).lower()

    # 'K' shorthand takes priority over explicit 'NNNNp' values.
    k_match = re.search(r"\d+k", text)
    if k_match:
        if '2' in k_match.group(0):
            return '1080'
        if '4' in k_match.group(0):
            return '2160'

    # Digits directly followed by 'p', e.g. '1080p@30fps' -> '1080'.
    p_match = re.search(r"\d+(?=p)", text)
    if p_match:
        return p_match.group(0)

    return None


# The pattern is 'x MP'. Do not accept any other pattern.
def cam_snap(string):
    """Extract the megapixel count from a still-camera description.

    Args:
        string: Raw cell value; a text description, or None / '' / float
            (NaN) when the value is missing.

    Returns:
        '0' when the value is missing, the number preceding 'MP' as a string
        (e.g. '8', '13', '0.3'), or None when no 'x MP' pattern is present.
    """
    if not string or isinstance(string, float):
        return str(0)

    # Spaces are removed so that '13 MP' and '13MP' are treated alike.
    compact = str(string).lower().replace(" ", "")

    # The fractional part is optional as a whole, so single-digit values such
    # as '8mp' match too (the previous pattern required two or more digits).
    mp_match = re.search(r"\d+(?:\.\d+)?(?=mp)", compact)

    return mp_match.group(0) if mp_match else None


def os(string):
Expand Down Expand Up @@ -136,11 +151,11 @@ def extract_screen_in(df):
"""
# Regex for screen size in inches
df['screen_size'] = df['display_size'].apply(
lambda x: re.search(r'^.*(?=( inches))', str(x).lower()) )
lambda x: re.search(r'^.*(?=( inches))', str(x).lower()))

# Regex for screen-body ratio
df['scn_bdy_ratio'] = df['display_size'].apply(
lambda x: re.search(r'\d{1,2}.\d(?=%)', str(x).lower()) )
lambda x: re.search(r'\d{1,2}.\d(?=%)', str(x).lower()))

# Apply results, NOTE: pandas doesn't like it when we're applying to multiple series.
results1 = df['scn_bdy_ratio'].apply(lambda y: y.group(0) if y else None)
Expand All @@ -164,6 +179,97 @@ def core_count(string):
return str(count)


# NOTE accept MB or GB only.
def extract_rom_ram(df):
    """
    Derive 'rom' and 'ram' columns from 'memory_internal', then drop the
    source column.

    Each new column is produced by applying the matching parser (get_rom /
    get_ram) to every 'memory_internal' cell. The columns are added to the
    passed-in frame; the returned frame is the result of dropping
    'memory_internal'.
    """
    memory = df['memory_internal']

    # ROM first, then RAM, so the new columns keep that order.
    for column, parser in (('rom', get_rom), ('ram', get_ram)):
        df[column] = memory.apply(parser)

    return df.drop(columns=['memory_internal'])


# Return the ram in MB
def get_ram(string):
    """Parse the RAM size out of a 'memory_internal' description.

    Args:
        string: Raw cell value; text such as '64GB 4GB RAM', or None / float
            (NaN) when the value is missing.

    Returns:
        None when the value is missing; '0' when the text does not mention
        'RAM'; otherwise the size in MB as a string (GB values are multiplied
        by 1000, e.g. '4000.0'; MB values are returned as written, e.g.
        '512'). If 'RAM' is present but no '<number>GB/MB' precedes it, a
        diagnostic is printed and the space-stripped text is returned
        unchanged.
    """
    # Missing values show up as float NaN (or None).
    if string is None or isinstance(string, float):
        return None

    # Get rid of spaces so '4 GB RAM' and '4GB RAM' look the same.
    compact = string.replace(" ", "")

    if "RAM" not in compact:
        return str(0)

    # Capture the whole number, including an optional fractional part, so
    # '1.5GBRAM' is read as 1.5 GB rather than 5 GB.
    match = re.search(r'(\d+(?:\.\d+)?)([GM])B(?=RAM)', compact)
    if not match:
        print(f"Something weird happened with {compact}")
        return compact

    amount, unit = match.groups()
    if unit == "G":
        return str(float(amount) * 1000)
    return amount


# Return the rom in MB
def get_rom(string):
    """Parse the storage (ROM) size out of a 'memory_internal' description.

    Args:
        string: Raw cell value; text such as '64GB 4GB RAM' or
            '256 MB ROM, 128 MB RAM', or None / float (NaN) when missing.

    Returns:
        None when the value is missing; otherwise the size in MB as a string
        (GB values are multiplied by 1000, e.g. '64000.0'; MB values are
        returned as written, e.g. '256'). When the text mentions 'ROM' but no
        '<number>GB/MB' precedes it, a diagnostic is printed and the
        space-stripped text is returned unchanged. When 'ROM' is absent, the
        first size not labelled as RAM is assumed to be the ROM; '0' is
        returned if there is none.
    """
    # Missing values show up as float NaN (or None).
    if string is None or isinstance(string, float):
        return None

    # Get rid of spaces so '4 GB ROM' and '4GB ROM' look the same.
    compact = string.replace(" ", "")

    if "ROM" in compact:
        match = re.search(r'(\d+(?:\.\d+)?)([GM])B(?=ROM)', compact)
        if not match:
            print(f"Something weird happened with {compact}")
            return compact
    else:
        # No explicit 'ROM' label: take the first size that is not tagged as
        # RAM and assume it refers to the ROM. (Splitting on whitespace cannot
        # work here because the spaces were already removed.)
        match = re.search(r'(\d+(?:\.\d+)?)([GM])B(?!RAM)', compact)
        if not match:
            return str(0)

    amount, unit = match.groups()
    if unit == "G":
        return str(float(amount) * 1000)
    return amount


def extract_cpu(df):
"""
Split 'platform_cpu' to 'core_count' and 'clock_speed', drop 'platform_cpu'
Expand Down Expand Up @@ -197,7 +303,8 @@ def get_clk_speed(string):

# Convert strings that have 'ghz' to '1000*x mhz' where x is in ghz.
def ghz_to_mhz(string):
if not string or 'ghz' not in string: return string
if not string or 'ghz' not in string:
return string

temp = float(string.split()[0])

Expand All @@ -220,15 +327,15 @@ def extract_price(string):
# case 0: EUR is present -> convert to usd
if "EUR" in string:
price = re.search("\d+\.{0,1}\d+", string)
if price:
if price:
final_price = float(price.group(0)) * 1.18

# case 1: INDR (rupees) is present -> convert to usd
elif "INR" in string:
price = re.search("\d+\.{0,1}\d+", string)
if price:
if price:
final_price = float(price.group(0)) * 0.013

elif "USD" in string:
price = re.search("\d+\.{0,1}\d+", string)
if price:
Expand All @@ -245,11 +352,15 @@ def extract_price(string):


# Function Map
f_map = {"launch_announced": launch_announced, "launch_status": available_discontinued,
"body_dimensions": squared_dimensions, "comms_wlan": wlan,
"comms_usb": usb_type,
"features_sensors": sensor, "platform_os": os,
"platform_gpu": gpu_platform}
f_map = {"launch_announced": launch_announced,
"body_dimensions": squared_dimensions,
"features_sensors": sensor,
"platform_gpu": gpu_platform,
"main_camera_video": cam_vid,
"main_camera_single": cam_snap,
"selfie_camera_video": cam_vid,
"selfie_camera_single": cam_snap,
}


def clean_data(df):
Expand All @@ -263,19 +374,23 @@ def clean_data(df):
df = extract_f(df)
df = extract_cpu(df)
df = extract_screen_in(df)

df = extract_rom_ram(df)

# Retreive price
df["misc_price"] = df["misc_price"].apply(extract_price)

# Encode 'OEM' with label-encoder after lower().
oem = df.oem.apply(lambda string: ''.join(c for c in string if c.isalnum()).lower())
oem = df.oem.apply(lambda string: ''.join(
c for c in string if c.isalnum()).lower())
oem = oem.apply(lambda x: str(x))
enc = LabelEncoder()
df['oem'] = enc.fit_transform(oem)
df['oem'] = df.oem.apply(pd.to_numeric)

# x = input("Extraction over. Continue to imputing & null drop phase? [Any Key to Continue]: ")

# Impute missing data & remove outliers
df_ret = fill_gaps(df)
df_ret = fill_gaps(df.drop(cols_to_drop, axis=1))
df_ret.set_index('key_index')
print(df_ret.index)

Expand All @@ -286,7 +401,9 @@ def clean_data(df):
if __name__ == '__main__':

# Open Dataset
data = pd.read_csv('C:/Users/capta/Desktop/9417-Great-Group/ml_algorithms/dataset/GSMArena_dataset_2020.csv', index_col=0)
# NOTE: change the path to your own path.
data = pd.read_csv(
'C:/Users/capta/Desktop/9417-Great-Group/ml_algorithms/dataset/GSMArena_dataset_2020.csv', index_col=0)

# Extract relevant features (for now)
data_features = data[all_features]
Expand Down
32 changes: 21 additions & 11 deletions ml_algorithms/auxiliary/data_interpolate.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""
Impute or Interpolate missing data according to categorical & numerical features.
Removes outliers according to m*IQR -> (m=1.5 default).
TODO: Add other functionality where fit.
"""
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
Expand All @@ -10,28 +9,32 @@


# Features
straight_features = ["memory_internal",
"main_camera_single", "main_camera_video",
"selfie_camera_video",
"selfie_camera_single", "battery"]

# straight features are straightforward to extract with \d+
# NOTE 1: the current 'straight' feature is battery, which seems to be mostly in mAh
# NOTE 2: should still check whether it's actually 'mAh' and remove or convert if not
straight_features = ["battery"]

all_features = ["oem", "launch_announced", "launch_status", "body_dimensions", "display_size", "comms_wlan", "comms_usb",
"features_sensors", "platform_os", "platform_cpu", "platform_gpu", "memory_internal",
"main_camera_single", "main_camera_video", "misc_price",
"selfie_camera_video",
"selfie_camera_single", "battery"]

final_features = ["oem", "launch_announced", "launch_status", "body_dimensions", "screen_size", "scn_bdy_ratio", "comms_wlan", "comms_usb",
"features_sensors", "platform_os", "core_count", "clock_speed", "platform_gpu", "memory_internal",
final_features = ["oem", "launch_announced", "body_dimensions", "screen_size", "scn_bdy_ratio",
"features_sensors", "clock_speed", "platform_gpu", "ram", "rom",
"main_camera_single", "main_camera_video", "misc_price",
"selfie_camera_video",
"selfie_camera_single", "battery"]

cols_to_drop = ['launch_status',
'comms_wlan', 'comms_usb', 'platform_os', 'core_count']

numeric_features = ["body_dimensions", "screen_size", "scn_bdy_ratio", "clock_speed", "memory_internal",
"main_camera_single", "main_camera_video", "misc_price",
"selfie_camera_video",
"selfie_camera_single", "battery"]

# TODO: there are some categorical features like 'main_camera_features' & etc. These features should also be included.

def rem_outliers(df):
for feature in numeric_features:
Expand All @@ -51,8 +54,6 @@ def rem_outliers(df):
return df


# TODO: Since we are basically trying to interpolate over 90% of the missing data points
# It may be better to just use the ~700-800 examples available after outlier removal.
def fill_gaps(df):
# NOTE: Can also use some interpolation (linear, cubic) instead.
i_imp = IterativeImputer(max_iter=20, random_state=6)
Expand All @@ -68,8 +69,17 @@ def fill_gaps(df):
# Remove outliers for each column, if they are 1.5X IQR for the column.
# df_ret = rem_outliers(df_ret)

# x = input("About to Impute, press any key to continue: ")

# Impute missing data, i.e. NaN.
# df_ret[df_ret.columns] = s_imp.fit_transform(df_ret[df_ret.columns])
df_impute = pd.DataFrame(s_imp.fit_transform(df_ret))
df_impute.columns = df_ret.columns
df_impute.index = df_ret.index

print("Dimensions of imputed df", df_impute.shape[0], df_impute.shape[1])

df_impute.to_csv('imputed_df.csv')
print("DF has been output to imputed_df.csv")

# TODO: Apply smoothing function (exponential, gaussian).

Expand Down