hyperswine · hyperswine · Aug 5, 2020 · Aug 5, 2020 · Aug 5, 2020 · Aug 5, 2020
diff --git a/.gitignore b/.gitignore
@@ -19,3 +19,4 @@ __pycache__
 
 # ADD ANY OTHER DIRECTORIES/FILES HERE
 *Report*
+*imputed*
diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@ Features that didn't seem as necessary included the device's 'launch status', co
 
 #### Performance of sklearn's algorithms
 
-As shown in the results of the Random Forest classifier by sklearn, we have a very high accuracy of 71%. The relationship seems to be either quite complex or perhaps something was missed. The question of whether the data is in the right form is also raised. The much poorer performance of the Multiple Layer Perceptron at 55% accuracy is also concerning.
+As shown in the results of sklearn's Random Forest classifier, we reach an accuracy of 80%. The relationship seems to be either quite complex, or perhaps something was missed. This also raises the question of whether the data is in the right form. The much poorer performance of the Multi-Layer Perceptron, at 10% accuracy, is also concerning.
 
 The question now is: What really is the relationship between a mobile device's technical specifications and its price? Is it linear, exponential or perhaps much more complex? 
 

diff --git a/ml_algorithms/LR.ipynb b/ml_algorithms/LR.ipynb
@@ -223,9 +223,9 @@
     "\n",
     "y = df[\"misc_price\"]\n",
     "X = df.drop([\"key_index\", \"misc_price\"], axis=1)\n",
-    "X = X.drop(['launch_status', 'comms_wlan', 'comms_usb', 'platform_os', 'core_count'], axis=1)\n",
+    "# X = X.drop(['launch_status', 'comms_wlan', 'comms_usb', 'platform_os', 'core_count'], axis=1)\n",
     "\n",
-    "df_lr = df.drop(['key_index','launch_status', 'comms_wlan', 'comms_usb', 'platform_os', 'core_count'], axis=1)\n",
+    "# df_lr = df.drop(['key_index','launch_status', 'comms_wlan', 'comms_usb', 'platform_os', 'core_count'], axis=1)\n",
     "\n",
     "# Train & test split. Seed = 120 for reproducing same shuffling of indices.\n",
     "# Note 70-30 split for the preliminary split.\n",

diff --git a/ml_algorithms/auxiliary/data_clean2.py b/ml_algorithms/auxiliary/data_clean2.py
@@ -12,7 +12,9 @@
 
 from .data_interpolate import *
 
-# TODO: change this such that cameras are '0' if we cannot regex them.
+# TODO: Other features like 'body_weight' and 'body_sim' could be included.
+
+
 def extract_straight(df):
     for feature in straight_features:
         s = df[feature]
@@ -85,25 +87,38 @@ def sensor(string):
     sensors = ['accelerometer', 'proximity',
                'compass', 'gyro', 'fingerprint', 'barometer']
 
-    return len([ sensor for sensor in sensors if (sensor in string.lower()) ])
+    return len([sensor for sensor in sensors if (sensor in string.lower())])
 
 
+# TODO: fix this. This should check whether '4K' or '2K' appear first before '1080p' etc.
+# Return the value '1080', '2160' if found, with no 'p'
+#
+# Summary of the return values:
+#   - str(0)  when the input is falsy or a float (presumably a NaN cell - confirm)
+#   - '1080'  when an 'Nk' token is found whose text contains '2'
+#   - '2160'  when an 'Nk' token is found whose text contains '4'
+#   - the digits directly in front of a lowercase 'p' (e.g. '720' from '720p')
+#   - None    when nothing above matched
 def cam_vid(string):
-    string = str(string)
+    if not string or type(string)==float:
+        return str(0)
 
-    p_string = re.search(r"\d+p", string)
-    k_string = ""
-
-    if not p_string:
-        k_string = re.search(r"\d+", string.lower())
-        if not k_string:
-            return string
+    # 'K' resolutions are looked for first; matching is case-insensitive
+    # because the string is lowercased before the search.
+    k_string = re.search(r"\d+k", string.lower())
+    if k_string:
+        # NOTE(review): these are substring tests on the matched text, so a
+        # token containing both digits (e.g. '24k') takes the '2' branch.
         if '2' in k_string.group(0):
-            return '1080p'
+            return '1080'
         if '4' in k_string.group(0):
-            return '2160p'
+            return '2160'
+
+    # Reached when there is no 'Nk' token, or when it held neither '2' nor '4'.
+    # The search runs on the original string, so only a lowercase 'p' matches.
+    p_string = re.search(r"\d+(?=p)", string)
 
-    return string
+    if p_string:
+        return p_string.group(0)
+
+    return None
+
+
+# The pattern is 'x MP' (e.g. '8 MP', '12.2 MP'). Do not accept any other pattern.
+def cam_snap(string):
+    """
+    Extract the megapixel count from a camera description.
+
+    Returns str(0) when the value is missing (falsy, or a float such as a
+    NaN cell), the number written in front of 'MP' as a string when one is
+    found, and None when the text contains no 'x MP' pattern.
+    """
+    if not string or isinstance(string, float):
+        return str(0)
+
+    # Case and spaces are normalised so '12 MP', '12MP' and '12 mp' all match.
+    # The fractional part is optional as a whole group: the earlier pattern
+    # r"\d+\.{0,1}\d+" required at least two digits and therefore rejected
+    # single-digit values such as '8 MP'.
+    mp_string = re.search(r"\d+(?:\.\d+)?(?=mp)",
+                          string.lower().replace(" ", ""))
+
+    return mp_string.group(0) if mp_string else None
 
 
 def os(string):
@@ -136,11 +151,11 @@ def extract_screen_in(df):
     """
     # Regex for screen size in inches
     df['screen_size'] = df['display_size'].apply(
-        lambda x: re.search(r'^.*(?=( inches))', str(x).lower()) )
+        lambda x: re.search(r'^.*(?=( inches))', str(x).lower()))
 
     # Regex for screen-body ratio
     df['scn_bdy_ratio'] = df['display_size'].apply(
-        lambda x: re.search(r'\d{1,2}.\d(?=%)', str(x).lower()) )
+        lambda x: re.search(r'\d{1,2}.\d(?=%)', str(x).lower()))
 
     # Apply results, NOTE: pandas doesn't like it when we're applying to multiple series.
     results1 = df['scn_bdy_ratio'].apply(lambda y: y.group(0) if y else None)
@@ -164,6 +179,97 @@ def core_count(string):
     return str(count)
 
 
+# NOTE: only sizes written in MB or GB are recognised.
+def extract_rom_ram(df):
+    """
+    Replace 'memory_internal' with separate 'rom' and 'ram' columns.
+
+    Both new columns are produced by running get_rom / get_ram over every
+    'memory_internal' value. The columns are added to the frame that was
+    passed in; the returned frame is that data without 'memory_internal'.
+    The get_ram/get_rom helpers share boiler-plate that could be abstracted.
+    """
+    memory = df['memory_internal']
+
+    # ROM is assigned before RAM so the new columns keep their order.
+    df['rom'] = memory.apply(get_rom)
+    df['ram'] = memory.apply(get_ram)
+
+    return df.drop(columns=['memory_internal'])
+
+
+# Return the ram in MB
+def get_ram(string):
+    """
+    Extract the RAM size, in MB, from a 'memory_internal' value.
+
+    Only a size written directly in front of the word 'RAM' is accepted
+    (spaces are ignored, the match is case-sensitive), in GB or MB.
+
+    Returns:
+        None           if the value is missing (None, or a float i.e. NaN).
+        str(0)         if no '<size>GB RAM' / '<size>MB RAM' is present.
+        '<n * 1000.0>' for a GB size, e.g. '4GB RAM' -> '4000.0'.
+        '<n>'          for an MB size, returned as written, e.g. '512'.
+    """
+    # float means that string is 'NaN'
+    if string is None or isinstance(string, float):
+        return None
+
+    # get rid of spaces so '4 GB RAM' and '4GB RAM' look the same
+    compact = string.replace(" ", "")
+
+    # The optional fraction keeps '1.5GB RAM' from being read as '5GB'.
+    found = re.search(r'(\d+(?:\.\d+)?)([GM])B(?=RAM)', compact)
+    if not found:
+        return str(0)
+
+    number, unit = found.groups()
+    if unit == "G":
+        # GB -> MB, using the same factor of 1000 as before
+        return str(float(number) * 1000)
+
+    return number
+
+
+# Return the rom in MB
+def get_rom(string):
+    """
+    Extract the ROM (storage) size, in MB, from a 'memory_internal' value.
+
+    Two forms are recognised, in this order:
+      1. A size written directly in front of the word 'ROM'
+         (spaces ignored), e.g. '4 GB ROM'.
+      2. Otherwise, when the value has at least two words, a size at the
+         very start of the value is assumed to be the ROM, e.g. the
+         '64GB' in '64GB 4GB RAM'. A leading size that is itself labelled
+         'RAM' ('4GB RAM') is not taken as ROM.
+
+    Returns:
+        None           if the value is missing (None, or a float i.e. NaN).
+        str(0)         if neither form matches.
+        '<n * 1000.0>' for a GB size, e.g. '64000.0'.
+        '<n>'          for an MB size, returned as written, e.g. '512'.
+    """
+    # float means that string is 'NaN'
+    if string is None or isinstance(string, float):
+        return None
+
+    def to_mb(number, unit):
+        # GB -> MB uses the same factor of 1000 as get_ram
+        return str(float(number) * 1000) if unit == "G" else number
+
+    # Form 1: explicit '<size>GB ROM' / '<size>MB ROM', spaces ignored.
+    compact = string.replace(" ", "")
+    labelled = re.search(r'(\d+(?:\.\d+)?)([GM])B(?=ROM)', compact)
+    if labelled:
+        return to_mb(*labelled.groups())
+
+    # Form 2: consider the first word(s). The word count must be taken from
+    # the ORIGINAL string: splitting the space-stripped copy can never give
+    # two words, which made this fallback return 0 for every input.
+    if len(string.split()) < 2:
+        return str(0)
+
+    # Matching the number with a pattern (instead of float() on a raw word)
+    # avoids a ValueError when the value starts with non-numeric text.
+    leading = re.match(r'\s*(\d+(?:\.\d+)?)\s*([GM])B(?!\s*RAM)', string)
+    if leading:
+        return to_mb(*leading.groups())
+
+    return str(0)
+
+
 def extract_cpu(df):
     """
     Split 'platform_cpu' to 'core_count' and 'clock_speed', drop 'platform_cpu'
@@ -197,7 +303,8 @@ def get_clk_speed(string):
 
 # Convert strings that have 'ghz' to '1000*x mhz' where x is in ghz.
 def ghz_to_mhz(string):
-    if not string or 'ghz' not in string: return string
+    if not string or 'ghz' not in string:
+        return string
 
     temp = float(string.split()[0])
 
@@ -220,15 +327,15 @@ def extract_price(string):
     # case 0: EUR is present -> convert to usd
     if "EUR" in string:
         price = re.search("\d+\.{0,1}\d+", string)
-        if price: 
+        if price:
             final_price = float(price.group(0)) * 1.18
 
     # case 1: INR (rupees) is present -> convert to usd
     elif "INR" in string:
         price = re.search("\d+\.{0,1}\d+", string)
-        if price: 
+        if price:
             final_price = float(price.group(0)) * 0.013
-    
+
     elif "USD" in string:
         price = re.search("\d+\.{0,1}\d+", string)
         if price:
@@ -245,11 +352,15 @@ def extract_price(string):
 
 
 # Function Map
-f_map = {"launch_announced": launch_announced, "launch_status": available_discontinued,
-         "body_dimensions": squared_dimensions, "comms_wlan": wlan,
-         "comms_usb": usb_type,
-         "features_sensors": sensor, "platform_os": os,
-         "platform_gpu": gpu_platform}
+# Maps a raw dataset column name to the function that parses one cell of it.
+# Both '*_camera_video' columns share cam_vid and both '*_camera_single'
+# columns share cam_snap.
+# NOTE(review): presumably consumed by extract_f, which applies each function
+# to its column - confirm against that function.
+f_map = {"launch_announced": launch_announced,
+         "body_dimensions": squared_dimensions,
+         "features_sensors": sensor,
+         "platform_gpu": gpu_platform,
+         "main_camera_video": cam_vid,
+         "main_camera_single": cam_snap,
+         "selfie_camera_video": cam_vid,
+         "selfie_camera_single": cam_snap,
+         }
 
 
 def clean_data(df):
@@ -263,19 +374,23 @@ def clean_data(df):
     df = extract_f(df)
     df = extract_cpu(df)
     df = extract_screen_in(df)
-
+    df = extract_rom_ram(df)
+
     # Retrieve price
     df["misc_price"] = df["misc_price"].apply(extract_price)
 
     # Encode 'OEM' with label-encoder after lower().
-    oem = df.oem.apply(lambda string: ''.join(c for c in string if c.isalnum()).lower())
+    oem = df.oem.apply(lambda string: ''.join(
+        c for c in string if c.isalnum()).lower())
     oem = oem.apply(lambda x: str(x))
     enc = LabelEncoder()
     df['oem'] = enc.fit_transform(oem)
     df['oem'] = df.oem.apply(pd.to_numeric)
 
+    # x = input("Extraction over. Continue to imputing & null drop phase? [Any Key to Continue]: ")
+
     # Impute missing data & remove outliers
-    df_ret = fill_gaps(df)
+    df_ret = fill_gaps(df.drop(cols_to_drop, axis=1))
     df_ret.set_index('key_index')
     print(df_ret.index)
 
@@ -286,7 +401,9 @@ def clean_data(df):
 if __name__ == '__main__':
 
     # Open Dataset
-    data = pd.read_csv('C:/Users/capta/Desktop/9417-Great-Group/ml_algorithms/dataset/GSMArena_dataset_2020.csv', index_col=0)
+    # NOTE: change the path to your own path.
+    data = pd.read_csv(
+        'C:/Users/capta/Desktop/9417-Great-Group/ml_algorithms/dataset/GSMArena_dataset_2020.csv', index_col=0)
 
     # Extract relevant features (for now)
     data_features = data[all_features]

diff --git a/ml_algorithms/auxiliary/data_interpolate.py b/ml_algorithms/auxiliary/data_interpolate.py
@@ -1,7 +1,6 @@
 """
 Impute or Interpolate missing data according to categorical & numerical features.
 Removes outliers according to m*IQR -> (m=1.5 default).
-TODO: Add other functionality where fit.
 """
 from sklearn.experimental import enable_iterative_imputer
 from sklearn.impute import SimpleImputer, IterativeImputer
@@ -10,28 +9,32 @@
 
 
 # Features
-straight_features = ["memory_internal",
-                     "main_camera_single", "main_camera_video",
-                     "selfie_camera_video",
-                     "selfie_camera_single", "battery"]
+
+# straight features are straightforward to extract with \d+
+# NOTE 1: the only 'straight' feature is currently battery, which seems to be mostly in mAh
+# NOTE 2: should still check that the unit is actually 'mAh', and remove or convert the value if not
+straight_features = ["battery"]
 
 all_features = ["oem", "launch_announced", "launch_status", "body_dimensions", "display_size", "comms_wlan", "comms_usb",
                 "features_sensors", "platform_os", "platform_cpu", "platform_gpu", "memory_internal",
                 "main_camera_single", "main_camera_video", "misc_price",
                 "selfie_camera_video",
                 "selfie_camera_single", "battery"]
 
-final_features = ["oem", "launch_announced", "launch_status", "body_dimensions", "screen_size", "scn_bdy_ratio", "comms_wlan", "comms_usb",
-                  "features_sensors", "platform_os", "core_count", "clock_speed", "platform_gpu", "memory_internal",
+final_features = ["oem", "launch_announced", "body_dimensions", "screen_size", "scn_bdy_ratio",
+                  "features_sensors", "clock_speed", "platform_gpu", "ram", "rom",
                   "main_camera_single", "main_camera_video", "misc_price",
                   "selfie_camera_video",
                   "selfie_camera_single", "battery"]
 
+cols_to_drop = ['launch_status',
+                'comms_wlan', 'comms_usb', 'platform_os', 'core_count']
+
 numeric_features = ["body_dimensions", "screen_size", "scn_bdy_ratio", "clock_speed", "memory_internal",
                     "main_camera_single", "main_camera_video", "misc_price",
                     "selfie_camera_video",
                     "selfie_camera_single", "battery"]
-
+# TODO: there are some categorical features like 'main_camera_features' & etc. These features should also be included.
 
 def rem_outliers(df):
     for feature in numeric_features:
@@ -51,8 +54,6 @@ def rem_outliers(df):
     return df
 
 
-# TODO: Since we are basically trying to interpolate over 90% of the missing data points
-# It may be better to just use the ~700-800 examples available after outlier removal.
 def fill_gaps(df):
     # NOTE: Can also use some interpolation (linear, cubic) instead.
     i_imp = IterativeImputer(max_iter=20, random_state=6)
@@ -68,8 +69,17 @@ def fill_gaps(df):
     # Remove outliers for each column, if they are 1.5X IQR for the column.
     # df_ret = rem_outliers(df_ret)
 
+    # x = input("About to Impute, press any key to continue: ")
+
     # Impute missing data, i.e. NaN.
-    # df_ret[df_ret.columns] = s_imp.fit_transform(df_ret[df_ret.columns])
+    df_impute = pd.DataFrame(s_imp.fit_transform(df_ret))
+    df_impute.columns = df_ret.columns
+    df_impute.index = df_ret.index
+
+    print("Dimensions of imputed df", df_impute.shape[0], df_impute.shape[1])
+
+    df_impute.to_csv('imputed_df.csv')
+    print("DF has been output to imputed_df.csv")
 
     # TODO: Apply smoothing function (exponential, gaussian).
Original file line number	Diff line number	Diff line change
Expand Up		@@ -19,3 +19,4 @@ __pycache__

		# ADD ANY OTHER DIRECTORIES/FILES HERE
		Report
		imputed