diff --git a/Yapiki_publ/Public_solution_KI_YP.ipynb b/Yapiki_publ/Public_solution_KI_YP.ipynb new file mode 100644 index 0000000..196a42e --- /dev/null +++ b/Yapiki_publ/Public_solution_KI_YP.ipynb @@ -0,0 +1,3330 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "78b262fa" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import typing\n", + "import torch\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML\n", + "from lightautoml.tasks import Task\n", + "\n", + "import phik\n", + "from phik.report import plot_correlation_matrix\n", + "from phik import report" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "5114ddf7" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((279792, 77), (2974, 76))" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_data = pd.read_csv('data/train.csv')\n", + "test_data = pd.read_csv('data/test.csv')\n", + "train_data.shape, test_data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EDA analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Аналитический отчёт pandas_profiling (внимание: файл большого размера): https://drive.google.com/file/d/1xQl3LvpX9J0G6gJoaBjzRcBFKZi6QZXz/view?usp=sharing" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "for col in train_data.select_dtypes(include=np.number).columns:\n", + " train_data[col] = pd.to_numeric(train_data[col], downcast = 'unsigned')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
latlngosm_amenity_points_in_0.001osm_amenity_points_in_0.005osm_amenity_points_in_0.0075osm_amenity_points_in_0.01osm_building_points_in_0.001osm_building_points_in_0.005osm_building_points_in_0.0075osm_building_points_in_0.01...reform_count_of_houses_500reform_house_population_1000reform_house_population_500reform_mean_floor_count_1000reform_mean_floor_count_500reform_mean_year_building_1000reform_mean_year_building_500total_squarerealty_typeprice_type
count279792.000000279792.000000279792.000000279792.000000279792.000000279792.000000279792.000000279792.000000279792.000000279792.000000...279792.000000265196.000000252558.000000263084.000000249624.000000263553.000000250155.000000279792.000000279792.000000279792.000000
mean54.36407847.7635402.70908440.60514681.596171133.2854580.0374420.8857012.0464673.748163...30.1106612042.541716644.6105577.0512337.3604641967.5325991967.988580507.83360454.9740880.016058
std4.24571317.0446254.20245153.293388105.193169172.2901360.3910146.85833814.80156625.679859...27.6862341359.884747445.6993293.5420844.23136945.80769954.1100151704.25177147.8564170.125700
min42.65189719.8921780.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000001.0000001.0000005.10000010.0000000.000000
25%53.22660037.5829880.0000007.00000016.00000028.0000000.0000000.0000000.0000000.000000...10.000000932.000000290.0000004.5918374.6199591960.0700001959.89009765.90000010.0000000.000000
50%55.67909039.7024351.00000022.00000046.00000077.0000000.0000000.0000000.0000000.000000...25.0000001949.000000602.0000006.3689326.3953491970.8904111971.647059128.73703410.0000000.000000
75%56.30697655.9575234.00000051.000000101.000000164.0000000.0000000.0000001.0000002.000000...43.0000002978.000000936.0000008.6989259.1000001983.7017541986.950000336.000000110.0000000.000000
max69.500740151.77700046.000000468.000000851.0000001392.00000030.000000586.000000949.0000001162.000000...289.00000018392.0000006105.00000053.717949221.6666672019.0000002020.00000040000.000000110.0000001.000000
\n", + "

8 rows × 70 columns

\n", + "
" + ], + "text/plain": [ + " lat lng osm_amenity_points_in_0.001 \\\n", + "count 279792.000000 279792.000000 279792.000000 \n", + "mean 54.364078 47.763540 2.709084 \n", + "std 4.245713 17.044625 4.202451 \n", + "min 42.651897 19.892178 0.000000 \n", + "25% 53.226600 37.582988 0.000000 \n", + "50% 55.679090 39.702435 1.000000 \n", + "75% 56.306976 55.957523 4.000000 \n", + "max 69.500740 151.777000 46.000000 \n", + "\n", + " osm_amenity_points_in_0.005 osm_amenity_points_in_0.0075 \\\n", + "count 279792.000000 279792.000000 \n", + "mean 40.605146 81.596171 \n", + "std 53.293388 105.193169 \n", + "min 0.000000 0.000000 \n", + "25% 7.000000 16.000000 \n", + "50% 22.000000 46.000000 \n", + "75% 51.000000 101.000000 \n", + "max 468.000000 851.000000 \n", + "\n", + " osm_amenity_points_in_0.01 osm_building_points_in_0.001 \\\n", + "count 279792.000000 279792.000000 \n", + "mean 133.285458 0.037442 \n", + "std 172.290136 0.391014 \n", + "min 0.000000 0.000000 \n", + "25% 28.000000 0.000000 \n", + "50% 77.000000 0.000000 \n", + "75% 164.000000 0.000000 \n", + "max 1392.000000 30.000000 \n", + "\n", + " osm_building_points_in_0.005 osm_building_points_in_0.0075 \\\n", + "count 279792.000000 279792.000000 \n", + "mean 0.885701 2.046467 \n", + "std 6.858338 14.801566 \n", + "min 0.000000 0.000000 \n", + "25% 0.000000 0.000000 \n", + "50% 0.000000 0.000000 \n", + "75% 0.000000 1.000000 \n", + "max 586.000000 949.000000 \n", + "\n", + " osm_building_points_in_0.01 ... reform_count_of_houses_500 \\\n", + "count 279792.000000 ... 279792.000000 \n", + "mean 3.748163 ... 30.110661 \n", + "std 25.679859 ... 27.686234 \n", + "min 0.000000 ... 0.000000 \n", + "25% 0.000000 ... 10.000000 \n", + "50% 0.000000 ... 25.000000 \n", + "75% 2.000000 ... 43.000000 \n", + "max 1162.000000 ... 
289.000000 \n", + "\n", + " reform_house_population_1000 reform_house_population_500 \\\n", + "count 265196.000000 252558.000000 \n", + "mean 2042.541716 644.610557 \n", + "std 1359.884747 445.699329 \n", + "min 0.000000 0.000000 \n", + "25% 932.000000 290.000000 \n", + "50% 1949.000000 602.000000 \n", + "75% 2978.000000 936.000000 \n", + "max 18392.000000 6105.000000 \n", + "\n", + " reform_mean_floor_count_1000 reform_mean_floor_count_500 \\\n", + "count 263084.000000 249624.000000 \n", + "mean 7.051233 7.360464 \n", + "std 3.542084 4.231369 \n", + "min 0.000000 0.000000 \n", + "25% 4.591837 4.619959 \n", + "50% 6.368932 6.395349 \n", + "75% 8.698925 9.100000 \n", + "max 53.717949 221.666667 \n", + "\n", + " reform_mean_year_building_1000 reform_mean_year_building_500 \\\n", + "count 263553.000000 250155.000000 \n", + "mean 1967.532599 1967.988580 \n", + "std 45.807699 54.110015 \n", + "min 1.000000 1.000000 \n", + "25% 1960.070000 1959.890097 \n", + "50% 1970.890411 1971.647059 \n", + "75% 1983.701754 1986.950000 \n", + "max 2019.000000 2020.000000 \n", + "\n", + " total_square realty_type price_type \n", + "count 279792.000000 279792.000000 279792.000000 \n", + "mean 507.833604 54.974088 0.016058 \n", + "std 1704.251771 47.856417 0.125700 \n", + "min 5.100000 10.000000 0.000000 \n", + "25% 65.900000 10.000000 0.000000 \n", + "50% 128.737034 10.000000 0.000000 \n", + "75% 336.000000 110.000000 0.000000 \n", + "max 40000.000000 110.000000 1.000000 \n", + "\n", + "[8 rows x 70 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_data.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "def check_features(df):\n", + " return pd.DataFrame({'unique_values': df.nunique(),'type': df.dtypes,'pct_missing': df.isna().sum()/len(df) * 100}).sort_values(by = 'pct_missing', ascending=False)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
floorreform_mean_floor_count_500reform_mean_year_building_500reform_house_population_500reform_mean_floor_count_1000reform_mean_year_building_1000reform_house_population_1000streetosm_city_nearest_populationregion...osm_finance_points_in_0.005osm_finance_points_in_0.001osm_culture_points_in_0.01osm_culture_points_in_0.0075osm_culture_points_in_0.005osm_culture_points_in_0.001osm_crossing_points_in_0.01osm_crossing_points_in_0.0075osm_crossing_points_in_0.005price_type
unique_values20617121431432366490177604462062884116949...297216159111162681911082
typeobjectfloat64float64float64float64float64float64objectfloat64object...uint8uint8uint16uint16uint8uint8uint16uint8uint8uint8
pct_missing62.988610.782310.59259.733665.971585.803955.216730.5739980.01965750...0000000000
\n", + "

3 rows × 77 columns

\n", + "
" + ], + "text/plain": [ + " floor reform_mean_floor_count_500 \\\n", + "unique_values 206 17121 \n", + "type object float64 \n", + "pct_missing 62.9886 10.7823 \n", + "\n", + " reform_mean_year_building_500 reform_house_population_500 \\\n", + "unique_values 43143 2366 \n", + "type float64 float64 \n", + "pct_missing 10.5925 9.73366 \n", + "\n", + " reform_mean_floor_count_1000 reform_mean_year_building_1000 \\\n", + "unique_values 49017 76044 \n", + "type float64 float64 \n", + "pct_missing 5.97158 5.80395 \n", + "\n", + " reform_house_population_1000 street \\\n", + "unique_values 6206 28841 \n", + "type float64 object \n", + "pct_missing 5.21673 0.573998 \n", + "\n", + " osm_city_nearest_population region ... \\\n", + "unique_values 169 49 ... \n", + "type float64 object ... \n", + "pct_missing 0.0196575 0 ... \n", + "\n", + " osm_finance_points_in_0.005 osm_finance_points_in_0.001 \\\n", + "unique_values 29 7 \n", + "type uint8 uint8 \n", + "pct_missing 0 0 \n", + "\n", + " osm_culture_points_in_0.01 osm_culture_points_in_0.0075 \\\n", + "unique_values 216 159 \n", + "type uint16 uint16 \n", + "pct_missing 0 0 \n", + "\n", + " osm_culture_points_in_0.005 osm_culture_points_in_0.001 \\\n", + "unique_values 111 16 \n", + "type uint8 uint8 \n", + "pct_missing 0 0 \n", + "\n", + " osm_crossing_points_in_0.01 osm_crossing_points_in_0.0075 \\\n", + "unique_values 268 191 \n", + "type uint16 uint8 \n", + "pct_missing 0 0 \n", + "\n", + " osm_crossing_points_in_0.005 price_type \n", + "unique_values 108 2 \n", + "type uint8 uint8 \n", + "pct_missing 0 0 \n", + "\n", + "[3 rows x 77 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "check_features(train_data).T" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([nan, 3.0, 4.0, -1.0, 1.0, 14.0, 2.0, 8.0, -2.0, 6.0, 10.0, 5.0,\n", + " 16.0, 19.0, 7.0, 9.0, 58.0, 
24.0, 18.0, 26.0, 17.0, 48.0, 11.0,\n", + " -3.0, 15.0, 22.0, 60.0, 12.0, 21.0, 35.0, 28.0, 38.0, 39.0, 13.0,\n", + " 81.0, 44.0, 82.0, 25.0, 45.0, 47.0, 23.0, 37.0, 29.0, 113.0, 78.0,\n", + " 42.0, 69.0, 27.0, 46.0, 53.0, 80.0, 70.0, 76.0, 64.0, 30.0, 73.0,\n", + " 77.0, 52.0, 67.0, 65.0, 20.0, 40.0, 49.0, 75.0, 93.0, 94.0, 91.0,\n", + " 72.0, 79.0, 84.0, 92.0, 33.0, 66.0, 90.0, 31.0, 36.0, 61.0, 71.0,\n", + " 68.0, 51.0, 97.0, 43.0, 95.0, 85.0, 50.0, 0.0, 62.0, 54.0, 74.0,\n", + " 57.0, 41.0, 34.0, 59.0, 56.0, 123.0, 55.0, 83.0, '27.0', '1.0',\n", + " '5.0', '-1.0', '67.0', '2.0', '0.0', '4.0', '6.0', '3.0', '15.0',\n", + " '10.0', '11.0', '30.0', '12.0', '-2.0', '14.0', '36.0', '8.0',\n", + " '50.0', '17.0', '19.0', '37.0', '68.0', '7.0', '42.0', '9.0',\n", + " '16.0', '20.0', '53.0', '91.0', '84.0', '38.0', '21.0', '48.0',\n", + " '22.0', '23.0', '1', '18.0', 'подвал, 1', '2', 'подвал',\n", + " 'цоколь, 1', '1,2,антресоль', 'цоколь', '4', '5', 'тех.этаж (6)',\n", + " '3', 'Подвал', 'Цоколь', '10', 'фактически на уровне 1 этажа', '6',\n", + " '1,2,3', '1, подвал', '1,2,3,4', '1,2', '1,2,3,4,5', '5, мансарда',\n", + " '1-й, подвал', '12', '15', '13', '1, подвал, антресоль', 'мезонин',\n", + " 'подвал, 1-3', '8', '7', '1 (Цокольный этаж)',\n", + " '3, Мансарда (4 эт)', 'подвал,1', '1, антресоль', '1-3',\n", + " 'мансарда (4эт)', '1, 2.', '9', 'подвал , 1 ', '1, 2',\n", + " 'подвал, 1,2,3', '1 + подвал (без отделки)', 'мансарда', '2,3',\n", + " '4, 5', '1-й, 2-й', '18', '1 этаж, подвал', '1, цоколь',\n", + " 'подвал, 1-7, техэтаж', '3 (антресоль)', '1, 2, 3',\n", + " 'Цоколь, 1,2(мансарда)', 'подвал, 3. 
4 этаж', 'подвал, 1-4 этаж',\n", + " 'подва, 1.2 этаж', '2, 3', '-1', '1.2', '11', '36', '7,8',\n", + " '1 этаж', '1-й', '3 этаж', '4 этаж', '5 этаж', 'подвал,1,2,3,4,5',\n", + " '29', 'подвал, цоколь, 1 этаж', '3, мансарда'], dtype=object)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_data['floor'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "#Вручную обработали столбец с этажами\n", + "for df in [train_data, test_data]:\n", + " df.replace('1', 1, inplace=True)\n", + " df.replace('1.0', 1, inplace=True)\n", + "\n", + "train_data['floor'] = train_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)\n", + "test_data['floor'] = test_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# вручную удалила повторяющиеся признаки с разным метражом, оставив только значение \"in 0.01\" для osm и \"1000\" для reform, чтобы матрица корреляции помещалась на экране\n", + "columns_for_pearson = [\n", + " 'per_square_meter_price', \n", + " 'city', 'floor',\n", + " 'osm_amenity_points_in_0.01', \n", + " 'osm_building_points_in_0.01', \n", + " 'osm_catering_points_in_0.01', \n", + " 'osm_city_closest_dist', \n", + " 'osm_city_nearest_name', \n", + " 'osm_city_nearest_population',\n", + " 'osm_crossing_closest_dist', \n", + " 'osm_crossing_points_in_0.01', \n", + " 'osm_culture_points_in_0.01',\n", + " 'osm_healthcare_points_in_0.01', \n", + " 'osm_historic_points_in_0.01', \n", + " 'osm_hotels_points_in_0.01',\n", + " 'osm_leisure_points_in_0.01', \n", + " 'osm_offices_points_in_0.01', \n", + " 'osm_shops_points_in_0.01', \n", + " 'osm_subway_closest_dist',\n", + " 'osm_train_stop_closest_dist', \n", + " 'osm_train_stop_points_in_0.01', \n", + " 'osm_transport_stop_closest_dist',\n", + " 
'osm_transport_stop_points_in_0.01', \n", + " 'reform_count_of_houses_1000', \n", + " 'reform_house_population_1000',\n", + " 'reform_mean_floor_count_1000', \n", + " 'reform_mean_year_building_1000', \n", + " 'region', \n", + " 'total_square', \n", + " 'street', \n", + " 'date', \n", + " 'realty_type', \n", + " 'price_type']" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
featurepearson
0per_square_meter_price1.00
6osm_city_nearest_population0.55
2osm_amenity_points_in_0.010.48
10osm_healthcare_points_in_0.010.46
4osm_catering_points_in_0.010.46
13osm_leisure_points_in_0.010.46
15osm_shops_points_in_0.010.44
20osm_transport_stop_points_in_0.010.43
8osm_crossing_points_in_0.010.43
14osm_offices_points_in_0.010.42
\n", + "
" + ], + "text/plain": [ + " feature pearson\n", + "0 per_square_meter_price 1.00\n", + "6 osm_city_nearest_population 0.55\n", + "2 osm_amenity_points_in_0.01 0.48\n", + "10 osm_healthcare_points_in_0.01 0.46\n", + "4 osm_catering_points_in_0.01 0.46\n", + "13 osm_leisure_points_in_0.01 0.46\n", + "15 osm_shops_points_in_0.01 0.44\n", + "20 osm_transport_stop_points_in_0.01 0.43\n", + "8 osm_crossing_points_in_0.01 0.43\n", + "14 osm_offices_points_in_0.01 0.42" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Находим признаки, имеющие максимальное значение коэффициента корреляции Пирсона с целевой переменной. \n", + "# В качестве отсечки выбрано значение 0.4 по модулю\n", + "pearson = train_data[columns_for_pearson].corr().round(2)\n", + "pearson_max_corr = (\n", + " pearson['per_square_meter_price'].to_frame().reset_index()\n", + " .rename(columns={'per_square_meter_price':'pearson', 'index':'feature'})\n", + " .sort_values(by='pearson', ascending=False)\n", + " .query('pearson > 0.4 or pearson < -0.4')\n", + " )\n", + "pearson_max_corr" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# fig, ax = plt.subplots(figsize=(20, 20))\n", + "# sns.heatmap(train_data[columns_for_pearson].corr().round(2), annot=True, square=True, cmap='mako')\n", + "# ax.set_title(label = 'МАТРИЦА КОРРЕЛЯЦИИ ПРИЗНАКОВ $r$', fontdict={'fontsize': 15, 'fontweight': 'bold'})\n", + "# plt.show();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Высокая взаимная корреляция (> 0.8) у следующих переменных:\n", + "- amenity и catering, healthcare, office, shop\n", + "- catering и shop\n", + "- office и shop, catering\n", + "- healthcare и catering, office, shop,\n", + "- transport и crossing" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "# убрали city и 
street(закодирована)\n", + "\n", + "columns_for_phik = [\n", + " 'per_square_meter_price',\n", + " 'floor',\n", + " 'osm_amenity_points_in_0.01',\n", + " 'osm_building_points_in_0.01',\n", + " 'osm_catering_points_in_0.01',\n", + " 'osm_city_closest_dist',\n", + " 'osm_city_nearest_name',\n", + " 'osm_city_nearest_population',\n", + " 'osm_crossing_closest_dist',\n", + " 'osm_crossing_points_in_0.01',\n", + " 'osm_culture_points_in_0.01',\n", + " 'osm_healthcare_points_in_0.01',\n", + " 'osm_historic_points_in_0.01',\n", + " 'osm_hotels_points_in_0.01',\n", + " 'osm_leisure_points_in_0.01',\n", + " 'osm_offices_points_in_0.01',\n", + " 'osm_shops_points_in_0.01',\n", + " 'osm_subway_closest_dist',\n", + " 'osm_train_stop_closest_dist',\n", + " 'osm_train_stop_points_in_0.01',\n", + " 'osm_transport_stop_closest_dist',\n", + " 'osm_transport_stop_points_in_0.01',\n", + " 'reform_count_of_houses_1000',\n", + " 'reform_house_population_1000',\n", + " 'reform_mean_floor_count_1000',\n", + " 'reform_mean_year_building_1000',\n", + " 'region',\n", + " 'total_square',\n", + " 'date',\n", + " 'realty_type',\n", + " 'price_type']" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
per_square_meter_pricefloorosm_amenity_points_in_0.01osm_building_points_in_0.01osm_catering_points_in_0.01osm_city_closest_distosm_city_nearest_nameosm_city_nearest_populationosm_crossing_closest_distosm_crossing_points_in_0.01...osm_transport_stop_points_in_0.01reform_count_of_houses_1000reform_house_population_1000reform_mean_floor_count_1000reform_mean_year_building_1000regiontotal_squaredaterealty_typeprice_type
per_square_meter_price1.000.180.480.020.480.050.570.550.000.57...0.460.160.190.230.030.520.020.110.180.05
floor0.181.000.070.010.060.080.280.150.010.14...0.120.030.100.150.000.230.010.220.040.00
osm_amenity_points_in_0.010.480.071.000.020.960.070.560.490.000.82...0.690.630.620.250.040.450.050.070.180.22
osm_building_points_in_0.010.020.010.021.000.020.000.280.170.000.11...0.040.030.030.030.000.220.000.010.030.01
osm_catering_points_in_0.010.480.060.960.021.000.050.510.470.000.80...0.630.570.600.230.030.400.050.070.190.20
osm_city_closest_dist0.050.080.070.000.051.000.800.110.730.22...0.150.090.110.130.000.400.000.020.060.01
osm_city_nearest_name0.570.280.560.280.510.801.001.000.420.64...0.620.550.480.590.441.000.120.220.370.30
osm_city_nearest_population0.550.150.490.170.470.111.001.000.000.60...0.490.200.240.310.050.970.070.140.140.07
osm_crossing_closest_dist0.000.010.000.000.000.730.420.001.000.00...0.020.010.000.010.000.070.000.000.010.00
osm_crossing_points_in_0.010.570.140.820.110.800.220.640.600.001.00...0.770.670.590.410.170.550.180.170.310.09
osm_culture_points_in_0.010.310.040.610.000.600.010.460.220.000.62...0.340.370.150.120.000.390.040.030.070.04
osm_healthcare_points_in_0.010.460.100.850.020.800.070.560.490.000.81...0.680.530.420.230.050.460.060.070.130.04
osm_historic_points_in_0.010.380.050.750.010.730.040.450.350.000.76...0.520.520.190.190.040.330.080.050.140.05
osm_hotels_points_in_0.010.370.060.880.000.880.030.440.420.000.71...0.500.520.560.170.010.360.050.050.080.21
osm_leisure_points_in_0.010.470.100.720.030.680.070.570.500.000.71...0.580.400.390.270.050.480.060.080.140.09
osm_offices_points_in_0.010.470.080.890.020.870.050.570.520.000.76...0.610.560.590.270.040.470.050.070.190.20
osm_shops_points_in_0.010.470.070.960.020.920.070.570.490.000.79...0.670.640.650.250.060.470.030.070.170.20
osm_subway_closest_dist0.180.090.140.120.120.410.990.310.320.20...0.180.120.090.150.040.910.030.060.090.04
osm_train_stop_closest_dist0.020.020.030.000.020.730.880.030.790.09...0.060.030.050.070.000.430.000.010.020.00
osm_train_stop_points_in_0.010.270.020.330.000.340.010.320.290.000.46...0.250.120.130.120.050.260.040.040.110.03
osm_transport_stop_closest_dist0.000.010.000.000.000.730.250.010.880.00...0.020.010.010.020.000.140.000.000.020.00
osm_transport_stop_points_in_0.010.460.120.690.040.630.150.620.490.020.77...1.000.520.450.330.050.520.050.080.200.07
reform_count_of_houses_10000.160.030.630.030.570.090.550.200.010.67...0.521.000.550.380.060.430.050.030.130.13
reform_house_population_10000.190.100.620.030.600.110.480.240.000.59...0.450.551.000.310.040.360.020.040.200.16
reform_mean_floor_count_10000.230.150.250.030.230.130.590.310.010.41...0.330.380.311.000.080.470.020.060.140.05
reform_mean_year_building_10000.030.000.040.000.030.000.440.050.000.17...0.050.060.040.081.000.370.000.020.020.00
region0.520.230.450.220.400.401.000.970.070.55...0.520.430.360.470.371.000.080.170.280.27
total_square0.020.010.050.000.050.000.120.070.000.18...0.050.050.020.020.000.081.000.020.040.01
date0.110.220.070.010.070.020.220.140.000.17...0.080.030.040.060.020.170.021.000.100.02
realty_type0.180.040.180.030.190.060.370.140.010.31...0.200.130.200.140.020.280.040.101.000.02
price_type0.050.000.220.010.200.010.300.070.000.09...0.070.130.160.050.000.270.010.020.021.00
\n", + "

31 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " per_square_meter_price floor \\\n", + "per_square_meter_price 1.00 0.18 \n", + "floor 0.18 1.00 \n", + "osm_amenity_points_in_0.01 0.48 0.07 \n", + "osm_building_points_in_0.01 0.02 0.01 \n", + "osm_catering_points_in_0.01 0.48 0.06 \n", + "osm_city_closest_dist 0.05 0.08 \n", + "osm_city_nearest_name 0.57 0.28 \n", + "osm_city_nearest_population 0.55 0.15 \n", + "osm_crossing_closest_dist 0.00 0.01 \n", + "osm_crossing_points_in_0.01 0.57 0.14 \n", + "osm_culture_points_in_0.01 0.31 0.04 \n", + "osm_healthcare_points_in_0.01 0.46 0.10 \n", + "osm_historic_points_in_0.01 0.38 0.05 \n", + "osm_hotels_points_in_0.01 0.37 0.06 \n", + "osm_leisure_points_in_0.01 0.47 0.10 \n", + "osm_offices_points_in_0.01 0.47 0.08 \n", + "osm_shops_points_in_0.01 0.47 0.07 \n", + "osm_subway_closest_dist 0.18 0.09 \n", + "osm_train_stop_closest_dist 0.02 0.02 \n", + "osm_train_stop_points_in_0.01 0.27 0.02 \n", + "osm_transport_stop_closest_dist 0.00 0.01 \n", + "osm_transport_stop_points_in_0.01 0.46 0.12 \n", + "reform_count_of_houses_1000 0.16 0.03 \n", + "reform_house_population_1000 0.19 0.10 \n", + "reform_mean_floor_count_1000 0.23 0.15 \n", + "reform_mean_year_building_1000 0.03 0.00 \n", + "region 0.52 0.23 \n", + "total_square 0.02 0.01 \n", + "date 0.11 0.22 \n", + "realty_type 0.18 0.04 \n", + "price_type 0.05 0.00 \n", + "\n", + " osm_amenity_points_in_0.01 \\\n", + "per_square_meter_price 0.48 \n", + "floor 0.07 \n", + "osm_amenity_points_in_0.01 1.00 \n", + "osm_building_points_in_0.01 0.02 \n", + "osm_catering_points_in_0.01 0.96 \n", + "osm_city_closest_dist 0.07 \n", + "osm_city_nearest_name 0.56 \n", + "osm_city_nearest_population 0.49 \n", + "osm_crossing_closest_dist 0.00 \n", + "osm_crossing_points_in_0.01 0.82 \n", + "osm_culture_points_in_0.01 0.61 \n", + "osm_healthcare_points_in_0.01 0.85 \n", + "osm_historic_points_in_0.01 0.75 \n", + "osm_hotels_points_in_0.01 0.88 \n", + "osm_leisure_points_in_0.01 0.72 \n", + 
"osm_offices_points_in_0.01 0.89 \n", + "osm_shops_points_in_0.01 0.96 \n", + "osm_subway_closest_dist 0.14 \n", + "osm_train_stop_closest_dist 0.03 \n", + "osm_train_stop_points_in_0.01 0.33 \n", + "osm_transport_stop_closest_dist 0.00 \n", + "osm_transport_stop_points_in_0.01 0.69 \n", + "reform_count_of_houses_1000 0.63 \n", + "reform_house_population_1000 0.62 \n", + "reform_mean_floor_count_1000 0.25 \n", + "reform_mean_year_building_1000 0.04 \n", + "region 0.45 \n", + "total_square 0.05 \n", + "date 0.07 \n", + "realty_type 0.18 \n", + "price_type 0.22 \n", + "\n", + " osm_building_points_in_0.01 \\\n", + "per_square_meter_price 0.02 \n", + "floor 0.01 \n", + "osm_amenity_points_in_0.01 0.02 \n", + "osm_building_points_in_0.01 1.00 \n", + "osm_catering_points_in_0.01 0.02 \n", + "osm_city_closest_dist 0.00 \n", + "osm_city_nearest_name 0.28 \n", + "osm_city_nearest_population 0.17 \n", + "osm_crossing_closest_dist 0.00 \n", + "osm_crossing_points_in_0.01 0.11 \n", + "osm_culture_points_in_0.01 0.00 \n", + "osm_healthcare_points_in_0.01 0.02 \n", + "osm_historic_points_in_0.01 0.01 \n", + "osm_hotels_points_in_0.01 0.00 \n", + "osm_leisure_points_in_0.01 0.03 \n", + "osm_offices_points_in_0.01 0.02 \n", + "osm_shops_points_in_0.01 0.02 \n", + "osm_subway_closest_dist 0.12 \n", + "osm_train_stop_closest_dist 0.00 \n", + "osm_train_stop_points_in_0.01 0.00 \n", + "osm_transport_stop_closest_dist 0.00 \n", + "osm_transport_stop_points_in_0.01 0.04 \n", + "reform_count_of_houses_1000 0.03 \n", + "reform_house_population_1000 0.03 \n", + "reform_mean_floor_count_1000 0.03 \n", + "reform_mean_year_building_1000 0.00 \n", + "region 0.22 \n", + "total_square 0.00 \n", + "date 0.01 \n", + "realty_type 0.03 \n", + "price_type 0.01 \n", + "\n", + " osm_catering_points_in_0.01 \\\n", + "per_square_meter_price 0.48 \n", + "floor 0.06 \n", + "osm_amenity_points_in_0.01 0.96 \n", + "osm_building_points_in_0.01 0.02 \n", + "osm_catering_points_in_0.01 1.00 \n", + 
"osm_city_closest_dist 0.05 \n", + "osm_city_nearest_name 0.51 \n", + "osm_city_nearest_population 0.47 \n", + "osm_crossing_closest_dist 0.00 \n", + "osm_crossing_points_in_0.01 0.80 \n", + "osm_culture_points_in_0.01 0.60 \n", + "osm_healthcare_points_in_0.01 0.80 \n", + "osm_historic_points_in_0.01 0.73 \n", + "osm_hotels_points_in_0.01 0.88 \n", + "osm_leisure_points_in_0.01 0.68 \n", + "osm_offices_points_in_0.01 0.87 \n", + "osm_shops_points_in_0.01 0.92 \n", + "osm_subway_closest_dist 0.12 \n", + "osm_train_stop_closest_dist 0.02 \n", + "osm_train_stop_points_in_0.01 0.34 \n", + "osm_transport_stop_closest_dist 0.00 \n", + "osm_transport_stop_points_in_0.01 0.63 \n", + "reform_count_of_houses_1000 0.57 \n", + "reform_house_population_1000 0.60 \n", + "reform_mean_floor_count_1000 0.23 \n", + "reform_mean_year_building_1000 0.03 \n", + "region 0.40 \n", + "total_square 0.05 \n", + "date 0.07 \n", + "realty_type 0.19 \n", + "price_type 0.20 \n", + "\n", + " osm_city_closest_dist \\\n", + "per_square_meter_price 0.05 \n", + "floor 0.08 \n", + "osm_amenity_points_in_0.01 0.07 \n", + "osm_building_points_in_0.01 0.00 \n", + "osm_catering_points_in_0.01 0.05 \n", + "osm_city_closest_dist 1.00 \n", + "osm_city_nearest_name 0.80 \n", + "osm_city_nearest_population 0.11 \n", + "osm_crossing_closest_dist 0.73 \n", + "osm_crossing_points_in_0.01 0.22 \n", + "osm_culture_points_in_0.01 0.01 \n", + "osm_healthcare_points_in_0.01 0.07 \n", + "osm_historic_points_in_0.01 0.04 \n", + "osm_hotels_points_in_0.01 0.03 \n", + "osm_leisure_points_in_0.01 0.07 \n", + "osm_offices_points_in_0.01 0.05 \n", + "osm_shops_points_in_0.01 0.07 \n", + "osm_subway_closest_dist 0.41 \n", + "osm_train_stop_closest_dist 0.73 \n", + "osm_train_stop_points_in_0.01 0.01 \n", + "osm_transport_stop_closest_dist 0.73 \n", + "osm_transport_stop_points_in_0.01 0.15 \n", + "reform_count_of_houses_1000 0.09 \n", + "reform_house_population_1000 0.11 \n", + "reform_mean_floor_count_1000 0.13 \n", + 
"reform_mean_year_building_1000 0.00 \n", + "region 0.40 \n", + "total_square 0.00 \n", + "date 0.02 \n", + "realty_type 0.06 \n", + "price_type 0.01 \n", + "\n", + " osm_city_nearest_name \\\n", + "per_square_meter_price 0.57 \n", + "floor 0.28 \n", + "osm_amenity_points_in_0.01 0.56 \n", + "osm_building_points_in_0.01 0.28 \n", + "osm_catering_points_in_0.01 0.51 \n", + "osm_city_closest_dist 0.80 \n", + "osm_city_nearest_name 1.00 \n", + "osm_city_nearest_population 1.00 \n", + "osm_crossing_closest_dist 0.42 \n", + "osm_crossing_points_in_0.01 0.64 \n", + "osm_culture_points_in_0.01 0.46 \n", + "osm_healthcare_points_in_0.01 0.56 \n", + "osm_historic_points_in_0.01 0.45 \n", + "osm_hotels_points_in_0.01 0.44 \n", + "osm_leisure_points_in_0.01 0.57 \n", + "osm_offices_points_in_0.01 0.57 \n", + "osm_shops_points_in_0.01 0.57 \n", + "osm_subway_closest_dist 0.99 \n", + "osm_train_stop_closest_dist 0.88 \n", + "osm_train_stop_points_in_0.01 0.32 \n", + "osm_transport_stop_closest_dist 0.25 \n", + "osm_transport_stop_points_in_0.01 0.62 \n", + "reform_count_of_houses_1000 0.55 \n", + "reform_house_population_1000 0.48 \n", + "reform_mean_floor_count_1000 0.59 \n", + "reform_mean_year_building_1000 0.44 \n", + "region 1.00 \n", + "total_square 0.12 \n", + "date 0.22 \n", + "realty_type 0.37 \n", + "price_type 0.30 \n", + "\n", + " osm_city_nearest_population \\\n", + "per_square_meter_price 0.55 \n", + "floor 0.15 \n", + "osm_amenity_points_in_0.01 0.49 \n", + "osm_building_points_in_0.01 0.17 \n", + "osm_catering_points_in_0.01 0.47 \n", + "osm_city_closest_dist 0.11 \n", + "osm_city_nearest_name 1.00 \n", + "osm_city_nearest_population 1.00 \n", + "osm_crossing_closest_dist 0.00 \n", + "osm_crossing_points_in_0.01 0.60 \n", + "osm_culture_points_in_0.01 0.22 \n", + "osm_healthcare_points_in_0.01 0.49 \n", + "osm_historic_points_in_0.01 0.35 \n", + "osm_hotels_points_in_0.01 0.42 \n", + "osm_leisure_points_in_0.01 0.50 \n", + "osm_offices_points_in_0.01 0.52 \n", + 
"osm_shops_points_in_0.01 0.49 \n", + "osm_subway_closest_dist 0.31 \n", + "osm_train_stop_closest_dist 0.03 \n", + "osm_train_stop_points_in_0.01 0.29 \n", + "osm_transport_stop_closest_dist 0.01 \n", + "osm_transport_stop_points_in_0.01 0.49 \n", + "reform_count_of_houses_1000 0.20 \n", + "reform_house_population_1000 0.24 \n", + "reform_mean_floor_count_1000 0.31 \n", + "reform_mean_year_building_1000 0.05 \n", + "region 0.97 \n", + "total_square 0.07 \n", + "date 0.14 \n", + "realty_type 0.14 \n", + "price_type 0.07 \n", + "\n", + " osm_crossing_closest_dist \\\n", + "per_square_meter_price 0.00 \n", + "floor 0.01 \n", + "osm_amenity_points_in_0.01 0.00 \n", + "osm_building_points_in_0.01 0.00 \n", + "osm_catering_points_in_0.01 0.00 \n", + "osm_city_closest_dist 0.73 \n", + "osm_city_nearest_name 0.42 \n", + "osm_city_nearest_population 0.00 \n", + "osm_crossing_closest_dist 1.00 \n", + "osm_crossing_points_in_0.01 0.00 \n", + "osm_culture_points_in_0.01 0.00 \n", + "osm_healthcare_points_in_0.01 0.00 \n", + "osm_historic_points_in_0.01 0.00 \n", + "osm_hotels_points_in_0.01 0.00 \n", + "osm_leisure_points_in_0.01 0.00 \n", + "osm_offices_points_in_0.01 0.00 \n", + "osm_shops_points_in_0.01 0.00 \n", + "osm_subway_closest_dist 0.32 \n", + "osm_train_stop_closest_dist 0.79 \n", + "osm_train_stop_points_in_0.01 0.00 \n", + "osm_transport_stop_closest_dist 0.88 \n", + "osm_transport_stop_points_in_0.01 0.02 \n", + "reform_count_of_houses_1000 0.01 \n", + "reform_house_population_1000 0.00 \n", + "reform_mean_floor_count_1000 0.01 \n", + "reform_mean_year_building_1000 0.00 \n", + "region 0.07 \n", + "total_square 0.00 \n", + "date 0.00 \n", + "realty_type 0.01 \n", + "price_type 0.00 \n", + "\n", + " osm_crossing_points_in_0.01 ... \\\n", + "per_square_meter_price 0.57 ... \n", + "floor 0.14 ... \n", + "osm_amenity_points_in_0.01 0.82 ... \n", + "osm_building_points_in_0.01 0.11 ... \n", + "osm_catering_points_in_0.01 0.80 ... 
\n", + "osm_city_closest_dist 0.22 ... \n", + "osm_city_nearest_name 0.64 ... \n", + "osm_city_nearest_population 0.60 ... \n", + "osm_crossing_closest_dist 0.00 ... \n", + "osm_crossing_points_in_0.01 1.00 ... \n", + "osm_culture_points_in_0.01 0.62 ... \n", + "osm_healthcare_points_in_0.01 0.81 ... \n", + "osm_historic_points_in_0.01 0.76 ... \n", + "osm_hotels_points_in_0.01 0.71 ... \n", + "osm_leisure_points_in_0.01 0.71 ... \n", + "osm_offices_points_in_0.01 0.76 ... \n", + "osm_shops_points_in_0.01 0.79 ... \n", + "osm_subway_closest_dist 0.20 ... \n", + "osm_train_stop_closest_dist 0.09 ... \n", + "osm_train_stop_points_in_0.01 0.46 ... \n", + "osm_transport_stop_closest_dist 0.00 ... \n", + "osm_transport_stop_points_in_0.01 0.77 ... \n", + "reform_count_of_houses_1000 0.67 ... \n", + "reform_house_population_1000 0.59 ... \n", + "reform_mean_floor_count_1000 0.41 ... \n", + "reform_mean_year_building_1000 0.17 ... \n", + "region 0.55 ... \n", + "total_square 0.18 ... \n", + "date 0.17 ... \n", + "realty_type 0.31 ... \n", + "price_type 0.09 ... 
\n", + "\n", + " osm_transport_stop_points_in_0.01 \\\n", + "per_square_meter_price 0.46 \n", + "floor 0.12 \n", + "osm_amenity_points_in_0.01 0.69 \n", + "osm_building_points_in_0.01 0.04 \n", + "osm_catering_points_in_0.01 0.63 \n", + "osm_city_closest_dist 0.15 \n", + "osm_city_nearest_name 0.62 \n", + "osm_city_nearest_population 0.49 \n", + "osm_crossing_closest_dist 0.02 \n", + "osm_crossing_points_in_0.01 0.77 \n", + "osm_culture_points_in_0.01 0.34 \n", + "osm_healthcare_points_in_0.01 0.68 \n", + "osm_historic_points_in_0.01 0.52 \n", + "osm_hotels_points_in_0.01 0.50 \n", + "osm_leisure_points_in_0.01 0.58 \n", + "osm_offices_points_in_0.01 0.61 \n", + "osm_shops_points_in_0.01 0.67 \n", + "osm_subway_closest_dist 0.18 \n", + "osm_train_stop_closest_dist 0.06 \n", + "osm_train_stop_points_in_0.01 0.25 \n", + "osm_transport_stop_closest_dist 0.02 \n", + "osm_transport_stop_points_in_0.01 1.00 \n", + "reform_count_of_houses_1000 0.52 \n", + "reform_house_population_1000 0.45 \n", + "reform_mean_floor_count_1000 0.33 \n", + "reform_mean_year_building_1000 0.05 \n", + "region 0.52 \n", + "total_square 0.05 \n", + "date 0.08 \n", + "realty_type 0.20 \n", + "price_type 0.07 \n", + "\n", + " reform_count_of_houses_1000 \\\n", + "per_square_meter_price 0.16 \n", + "floor 0.03 \n", + "osm_amenity_points_in_0.01 0.63 \n", + "osm_building_points_in_0.01 0.03 \n", + "osm_catering_points_in_0.01 0.57 \n", + "osm_city_closest_dist 0.09 \n", + "osm_city_nearest_name 0.55 \n", + "osm_city_nearest_population 0.20 \n", + "osm_crossing_closest_dist 0.01 \n", + "osm_crossing_points_in_0.01 0.67 \n", + "osm_culture_points_in_0.01 0.37 \n", + "osm_healthcare_points_in_0.01 0.53 \n", + "osm_historic_points_in_0.01 0.52 \n", + "osm_hotels_points_in_0.01 0.52 \n", + "osm_leisure_points_in_0.01 0.40 \n", + "osm_offices_points_in_0.01 0.56 \n", + "osm_shops_points_in_0.01 0.64 \n", + "osm_subway_closest_dist 0.12 \n", + "osm_train_stop_closest_dist 0.03 \n", + 
"osm_train_stop_points_in_0.01 0.12 \n", + "osm_transport_stop_closest_dist 0.01 \n", + "osm_transport_stop_points_in_0.01 0.52 \n", + "reform_count_of_houses_1000 1.00 \n", + "reform_house_population_1000 0.55 \n", + "reform_mean_floor_count_1000 0.38 \n", + "reform_mean_year_building_1000 0.06 \n", + "region 0.43 \n", + "total_square 0.05 \n", + "date 0.03 \n", + "realty_type 0.13 \n", + "price_type 0.13 \n", + "\n", + " reform_house_population_1000 \\\n", + "per_square_meter_price 0.19 \n", + "floor 0.10 \n", + "osm_amenity_points_in_0.01 0.62 \n", + "osm_building_points_in_0.01 0.03 \n", + "osm_catering_points_in_0.01 0.60 \n", + "osm_city_closest_dist 0.11 \n", + "osm_city_nearest_name 0.48 \n", + "osm_city_nearest_population 0.24 \n", + "osm_crossing_closest_dist 0.00 \n", + "osm_crossing_points_in_0.01 0.59 \n", + "osm_culture_points_in_0.01 0.15 \n", + "osm_healthcare_points_in_0.01 0.42 \n", + "osm_historic_points_in_0.01 0.19 \n", + "osm_hotels_points_in_0.01 0.56 \n", + "osm_leisure_points_in_0.01 0.39 \n", + "osm_offices_points_in_0.01 0.59 \n", + "osm_shops_points_in_0.01 0.65 \n", + "osm_subway_closest_dist 0.09 \n", + "osm_train_stop_closest_dist 0.05 \n", + "osm_train_stop_points_in_0.01 0.13 \n", + "osm_transport_stop_closest_dist 0.01 \n", + "osm_transport_stop_points_in_0.01 0.45 \n", + "reform_count_of_houses_1000 0.55 \n", + "reform_house_population_1000 1.00 \n", + "reform_mean_floor_count_1000 0.31 \n", + "reform_mean_year_building_1000 0.04 \n", + "region 0.36 \n", + "total_square 0.02 \n", + "date 0.04 \n", + "realty_type 0.20 \n", + "price_type 0.16 \n", + "\n", + " reform_mean_floor_count_1000 \\\n", + "per_square_meter_price 0.23 \n", + "floor 0.15 \n", + "osm_amenity_points_in_0.01 0.25 \n", + "osm_building_points_in_0.01 0.03 \n", + "osm_catering_points_in_0.01 0.23 \n", + "osm_city_closest_dist 0.13 \n", + "osm_city_nearest_name 0.59 \n", + "osm_city_nearest_population 0.31 \n", + "osm_crossing_closest_dist 0.01 \n", + 
"osm_crossing_points_in_0.01 0.41 \n", + "osm_culture_points_in_0.01 0.12 \n", + "osm_healthcare_points_in_0.01 0.23 \n", + "osm_historic_points_in_0.01 0.19 \n", + "osm_hotels_points_in_0.01 0.17 \n", + "osm_leisure_points_in_0.01 0.27 \n", + "osm_offices_points_in_0.01 0.27 \n", + "osm_shops_points_in_0.01 0.25 \n", + "osm_subway_closest_dist 0.15 \n", + "osm_train_stop_closest_dist 0.07 \n", + "osm_train_stop_points_in_0.01 0.12 \n", + "osm_transport_stop_closest_dist 0.02 \n", + "osm_transport_stop_points_in_0.01 0.33 \n", + "reform_count_of_houses_1000 0.38 \n", + "reform_house_population_1000 0.31 \n", + "reform_mean_floor_count_1000 1.00 \n", + "reform_mean_year_building_1000 0.08 \n", + "region 0.47 \n", + "total_square 0.02 \n", + "date 0.06 \n", + "realty_type 0.14 \n", + "price_type 0.05 \n", + "\n", + " reform_mean_year_building_1000 region \\\n", + "per_square_meter_price 0.03 0.52 \n", + "floor 0.00 0.23 \n", + "osm_amenity_points_in_0.01 0.04 0.45 \n", + "osm_building_points_in_0.01 0.00 0.22 \n", + "osm_catering_points_in_0.01 0.03 0.40 \n", + "osm_city_closest_dist 0.00 0.40 \n", + "osm_city_nearest_name 0.44 1.00 \n", + "osm_city_nearest_population 0.05 0.97 \n", + "osm_crossing_closest_dist 0.00 0.07 \n", + "osm_crossing_points_in_0.01 0.17 0.55 \n", + "osm_culture_points_in_0.01 0.00 0.39 \n", + "osm_healthcare_points_in_0.01 0.05 0.46 \n", + "osm_historic_points_in_0.01 0.04 0.33 \n", + "osm_hotels_points_in_0.01 0.01 0.36 \n", + "osm_leisure_points_in_0.01 0.05 0.48 \n", + "osm_offices_points_in_0.01 0.04 0.47 \n", + "osm_shops_points_in_0.01 0.06 0.47 \n", + "osm_subway_closest_dist 0.04 0.91 \n", + "osm_train_stop_closest_dist 0.00 0.43 \n", + "osm_train_stop_points_in_0.01 0.05 0.26 \n", + "osm_transport_stop_closest_dist 0.00 0.14 \n", + "osm_transport_stop_points_in_0.01 0.05 0.52 \n", + "reform_count_of_houses_1000 0.06 0.43 \n", + "reform_house_population_1000 0.04 0.36 \n", + "reform_mean_floor_count_1000 0.08 0.47 \n", + 
"reform_mean_year_building_1000 1.00 0.37 \n", + "region 0.37 1.00 \n", + "total_square 0.00 0.08 \n", + "date 0.02 0.17 \n", + "realty_type 0.02 0.28 \n", + "price_type 0.00 0.27 \n", + "\n", + " total_square date realty_type price_type \n", + "per_square_meter_price 0.02 0.11 0.18 0.05 \n", + "floor 0.01 0.22 0.04 0.00 \n", + "osm_amenity_points_in_0.01 0.05 0.07 0.18 0.22 \n", + "osm_building_points_in_0.01 0.00 0.01 0.03 0.01 \n", + "osm_catering_points_in_0.01 0.05 0.07 0.19 0.20 \n", + "osm_city_closest_dist 0.00 0.02 0.06 0.01 \n", + "osm_city_nearest_name 0.12 0.22 0.37 0.30 \n", + "osm_city_nearest_population 0.07 0.14 0.14 0.07 \n", + "osm_crossing_closest_dist 0.00 0.00 0.01 0.00 \n", + "osm_crossing_points_in_0.01 0.18 0.17 0.31 0.09 \n", + "osm_culture_points_in_0.01 0.04 0.03 0.07 0.04 \n", + "osm_healthcare_points_in_0.01 0.06 0.07 0.13 0.04 \n", + "osm_historic_points_in_0.01 0.08 0.05 0.14 0.05 \n", + "osm_hotels_points_in_0.01 0.05 0.05 0.08 0.21 \n", + "osm_leisure_points_in_0.01 0.06 0.08 0.14 0.09 \n", + "osm_offices_points_in_0.01 0.05 0.07 0.19 0.20 \n", + "osm_shops_points_in_0.01 0.03 0.07 0.17 0.20 \n", + "osm_subway_closest_dist 0.03 0.06 0.09 0.04 \n", + "osm_train_stop_closest_dist 0.00 0.01 0.02 0.00 \n", + "osm_train_stop_points_in_0.01 0.04 0.04 0.11 0.03 \n", + "osm_transport_stop_closest_dist 0.00 0.00 0.02 0.00 \n", + "osm_transport_stop_points_in_0.01 0.05 0.08 0.20 0.07 \n", + "reform_count_of_houses_1000 0.05 0.03 0.13 0.13 \n", + "reform_house_population_1000 0.02 0.04 0.20 0.16 \n", + "reform_mean_floor_count_1000 0.02 0.06 0.14 0.05 \n", + "reform_mean_year_building_1000 0.00 0.02 0.02 0.00 \n", + "region 0.08 0.17 0.28 0.27 \n", + "total_square 1.00 0.02 0.04 0.01 \n", + "date 0.02 1.00 0.10 0.02 \n", + "realty_type 0.04 0.10 1.00 0.02 \n", + "price_type 0.01 0.02 0.02 1.00 \n", + "\n", + "[31 rows x 31 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"# считаем корреляцию phik - она позволяет находитб взаимосвязи в том числе между категориальными переменными. \n", + "# направление взаимосвязи не видно, только абсолютное значение. Чем ближе к единице, тем лучше\n", + "\n", + "# выделяем интервальные переменные\n", + "interval_cols = ['osm_amenity_points_in_0.01', 'osm_building_points_in_0.01',\n", + " 'osm_catering_points_in_0.01', 'osm_city_closest_dist',\n", + " 'osm_city_nearest_population','osm_crossing_closest_dist', 'osm_crossing_points_in_0.001', 'osm_culture_points_in_0.01',\n", + " 'osm_healthcare_points_in_0.01', 'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.01',\n", + " 'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.01',\n", + " 'osm_shops_points_in_0.01', 'osm_subway_closest_dist',\n", + " 'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.01',\n", + " 'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.01', 'per_square_meter_price',\n", + " 'reform_count_of_houses_1000', 'reform_house_population_1000', \n", + " 'reform_mean_floor_count_1000', 'reform_mean_year_building_1000',\n", + " 'total_square', 'realty_type', 'price_type', 'many_floors',\n", + " 'city', 'street']\n", + "\n", + "# строим матрицу корреляции\n", + "phik_overview = train_data[columns_for_phik].phik_matrix(interval_cols=interval_cols)\n", + "\n", + "phik_overview.round(2)\n", + "\n", + "# визуализируем с помощью тепловой карты\n", + "# fig, ax = plt.subplots(figsize=(20, 20))\n", + "# sns.heatmap(phik_overview.round(2), annot=True, square=True, cmap='mako')\n", + "# ax.set_title(label = 'МАТРИЦА КОРРЕЛЯЦИИ ПРИЗНАКОВ $\\phi_K$', fontdict={'fontsize': 15, 'fontweight': 'bold'})\n", + "\n", + "# plt.tight_layout()\n", + "# plt.show();" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
featurephik
0per_square_meter_price1.00
9osm_crossing_points_in_0.010.57
6osm_city_nearest_name0.57
7osm_city_nearest_population0.55
26region0.52
2osm_amenity_points_in_0.010.48
4osm_catering_points_in_0.010.48
14osm_leisure_points_in_0.010.47
15osm_offices_points_in_0.010.47
16osm_shops_points_in_0.010.47
11osm_healthcare_points_in_0.010.46
21osm_transport_stop_points_in_0.010.46
\n", + "
" + ], + "text/plain": [ + " feature phik\n", + "0 per_square_meter_price 1.00\n", + "9 osm_crossing_points_in_0.01 0.57\n", + "6 osm_city_nearest_name 0.57\n", + "7 osm_city_nearest_population 0.55\n", + "26 region 0.52\n", + "2 osm_amenity_points_in_0.01 0.48\n", + "4 osm_catering_points_in_0.01 0.48\n", + "14 osm_leisure_points_in_0.01 0.47\n", + "15 osm_offices_points_in_0.01 0.47\n", + "16 osm_shops_points_in_0.01 0.47\n", + "11 osm_healthcare_points_in_0.01 0.46\n", + "21 osm_transport_stop_points_in_0.01 0.46" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Формируем список переменных с максимальным коэффициентов корреляции phik с целевой переменной \n", + "phik_max_corr = (\n", + " phik_overview['per_square_meter_price'].to_frame().reset_index()\n", + " .rename(columns={'per_square_meter_price':'phik', 'index':'feature'})\n", + " .sort_values(by='phik', ascending=False)\n", + " .query('phik > 0.4')\n", + " .round(2)\n", + " )\n", + "phik_max_corr\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Матрица корреляции phik показала максимальную взаимосвязь целевой переменной с пременными:\n", + "- количество пешеходных переходов в радиусе 1 км\n", + "- название ближайшего города\n", + "- население ближайшего города\n", + "- регион\n", + "- количество в радиусе 1 км точек кейтеринга, досуга, офисов, магазинов, медучреждений, остановок общественного транспорта и объектов связаннных с удобством\n", + "\n", + "Также обнаружены новые взаимосвязи переменных между собой, которые не были видны на матрице корреляции Пирсона:\n", + "- этаж и price_type\n", + "- название близлежащего города и население ближайшего города, расстояние до ближайшего метро, остановки общественного транспорта, регион\n", + "- регион и расстояние до ближайшего метро" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modelling" + ] + }, + { + "cell_type": "code", + 
"execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# add features\n", + "city_population = pd.read_csv('city_population.csv')\n", + "zarplaty = pd.read_excel('zarplaty.xlsx')\n", + "\n", + "def city_type(row):\n", + " if row >=1000000:\n", + " return \"1Million\"\n", + " elif (row<1000000)&(row >200000):\n", + " return \"Medium\"\n", + " elif (row <=200000):\n", + " return \"Small\"\n", + " \n", + "def floor_type(row):\n", + " if ('1' in str(row))&(row!=-1):\n", + " return 1\n", + " else:\n", + " return 0\n", + " \n", + "def add_features(df):\n", + " df['age'] = round(2021 - df['reform_mean_year_building_500'])\n", + " df.city = df.city.apply(lambda x: x.lower())\n", + " \n", + " city_population_clean = city_population.groupby('settlement').agg({'population':'sum'}).reset_index()\n", + " city_population_clean.columns = ['city', 'city_population']\n", + " city_population_clean['city_population']\n", + " city_population_clean.city = city_population_clean.city.apply(lambda x: x.lower())\n", + " df = df.merge(city_population_clean, on = 'city', how='left')\n", + " \n", + " for col in df.select_dtypes(include=np.number).columns:\n", + " df[col] = pd.to_numeric(df[col], downcast = 'unsigned')\n", + " \n", + " df['city_type'] = df['city_population'].apply(lambda x: city_type(x))\n", + " df.loc[df.city == 'москва', 'city_type'] = \"Capital\"\n", + " df.loc[df.city == 'санкт-Петербург', 'city_type'] = \"Capital\"\n", + " \n", + " df = df.merge(zarplaty, on = 'region', how='left')\n", + " df['zarplata'] = pd.to_numeric(df['zarplata'], downcast = 'unsigned')\n", + " df['floor_type'] = df['floor'].apply(lambda x: floor_type(x))\n", + " \n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "train_data = add_features(train_data)\n", + "test_data = add_features(test_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": 
{ + "text/plain": [ + "((279967, 82), (2974, 81))" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_data.shape, test_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityflooridlatlngosm_amenity_points_in_0.001osm_amenity_points_in_0.005osm_amenity_points_in_0.0075osm_amenity_points_in_0.01osm_building_points_in_0.001...total_squarestreetdaterealty_typeprice_typeagecity_populationcity_typezarplatafloor_type
0пермь0COL_057.99820756.29279741935520...32.0S272892020-01-0510060.01048011.01Million41958.00
1шатура0COL_155.57428439.54383532437590...280.0S170522020-01-0510064.032885.0Small58066.00
2ярославль0COL_257.61914039.850525130671280...297.4S169132020-01-05110048.0604128.0MediumNaN0
3новокузнецк0COL_353.89708387.108604005210...190.0S101482020-01-0511007.0551919.0Medium43429.00
4москва0COL_455.80259037.487110123641530...60.2S13382020-01-0510060.012380691.0Capital100070.00
\n", + "

5 rows × 82 columns

\n", + "
" + ], + "text/plain": [ + " city floor id lat lng \\\n", + "0 пермь 0 COL_0 57.998207 56.292797 \n", + "1 шатура 0 COL_1 55.574284 39.543835 \n", + "2 ярославль 0 COL_2 57.619140 39.850525 \n", + "3 новокузнецк 0 COL_3 53.897083 87.108604 \n", + "4 москва 0 COL_4 55.802590 37.487110 \n", + "\n", + " osm_amenity_points_in_0.001 osm_amenity_points_in_0.005 \\\n", + "0 4 19 \n", + "1 3 24 \n", + "2 1 30 \n", + "3 0 0 \n", + "4 1 23 \n", + "\n", + " osm_amenity_points_in_0.0075 osm_amenity_points_in_0.01 \\\n", + "0 35 52 \n", + "1 37 59 \n", + "2 67 128 \n", + "3 5 21 \n", + "4 64 153 \n", + "\n", + " osm_building_points_in_0.001 ... total_square street date \\\n", + "0 0 ... 32.0 S27289 2020-01-05 \n", + "1 0 ... 280.0 S17052 2020-01-05 \n", + "2 0 ... 297.4 S16913 2020-01-05 \n", + "3 0 ... 190.0 S10148 2020-01-05 \n", + "4 0 ... 60.2 S1338 2020-01-05 \n", + "\n", + " realty_type price_type age city_population city_type zarplata \\\n", + "0 10 0 60.0 1048011.0 1Million 41958.0 \n", + "1 10 0 64.0 32885.0 Small 58066.0 \n", + "2 110 0 48.0 604128.0 Medium NaN \n", + "3 110 0 7.0 551919.0 Medium 43429.0 \n", + "4 10 0 60.0 12380691.0 Capital 100070.0 \n", + "\n", + " floor_type \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "\n", + "[5 rows x 82 columns]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "train_data = train_data.query('price_type == 1')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "train_data.replace('1', 1, inplace=True)\n", + "train_data.replace('1.0', 1, inplace=True)\n", + "test_data.replace('1', 1, inplace=True)\n", + "test_data.replace('1.0', 1, inplace=True)\n", + "\n", + "train_data['floor'] = train_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)\n", + 
"test_data['floor'] = test_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "id": "811f6b6d" + }, + "outputs": [], + "source": [ + "N_THREADS = 4 # threads cnt for lgbm and linear models\n", + "N_FOLDS = 5 # folds cnt for AutoML\n", + "RANDOM_STATE = 42 # fixed random state for various reasons\n", + "#TEST_SIZE = 0.1 # Test size for metric check\n", + "TIMEOUT = 100 # Time in seconds for automl run USE TIMEOUT = 1700 for perfect score\n", + "\n", + "np.random.seed(RANDOM_STATE)\n", + "torch.set_num_threads(N_THREADS)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "id": "e6d51e72" + }, + "outputs": [], + "source": [ + "# train_df, test_df = train_test_split(train_data, \n", + "# test_size=TEST_SIZE, \n", + "# random_state=RANDOM_STATE)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "THRESHOLD = 0.15\n", + "NEGATIVE_WEIGHT = 1.1\n", + "\n", + "def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:\n", + " \"\"\"\n", + " Реализация кастомной метрики для хакатона.\n", + "\n", + " :param y_true: float, реальная цена\n", + " :param y_pred: float, предсказанная цена\n", + " :return: float, значение метрики\n", + " \"\"\"\n", + " deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)\n", + " if np.abs(deviation) <= THRESHOLD:\n", + " return 0\n", + " elif deviation <= - 4 * THRESHOLD:\n", + " return 9 * NEGATIVE_WEIGHT\n", + " elif deviation < -THRESHOLD:\n", + " return NEGATIVE_WEIGHT * ((deviation / THRESHOLD) + 1) ** 2\n", + " elif deviation < 4 * THRESHOLD:\n", + " return ((deviation / THRESHOLD) - 1) ** 2\n", + " else:\n", + " return 9\n", + "\n", + "\n", + "def deviation_metric(y_true: np.array, y_pred: np.array) -> float:\n", + " return np.array([deviation_metric_one_sample(y_true[n], y_pred[n]) for n in 
range(len(y_true))]).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "id": "11901a60" + }, + "outputs": [], + "source": [ + "task = Task('reg', loss = 'rmsle', metric = deviation_metric)\n", + "\n", + "roles = {\n", + " 'target': 'per_square_meter_price',\n", + " 'drop': 'id'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 800 + }, + "id": "c77216e8", + "outputId": "ae6a7952-b341-40ed-f000-00f38639be74" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}\n", + "Found reader_params in kwargs, need to combine\n", + "Merged variant for reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 42}\n", + "Start automl preset with listed constraints:\n", + "- time: 100.0 seconds\n", + "- cpus: 4 cores\n", + "- memory: 16 gb\n", + "\n", + "Train data shape: (4493, 82)\n", + "Feats was rejected during automatic roles guess: []\n", + "\n", + "\n", + "Layer 1 ...\n", + "Train process start. 
Time left 89.13144850730896 secs\n", + "Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...\n", + "\n", + "===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n", + "\n", + "Linear model: C = 1e-05 score = [-3.7152774]\n", + "Linear model: C = 5e-05 score = [-3.7152774]\n", + "Linear model: C = 0.0001 score = [-3.7152774]\n", + "\n", + "===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n", + "\n", + "Linear model: C = 1e-05 score = [-3.3165922]\n", + "Linear model: C = 5e-05 score = [-3.3165922]\n", + "Linear model: C = 0.0001 score = [-3.3165922]\n", + "\n", + "===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n", + "\n", + "Linear model: C = 1e-05 score = [-3.6910217]\n", + "Linear model: C = 5e-05 score = [-3.6910217]\n", + "Linear model: C = 0.0001 score = [-3.6910217]\n", + "\n", + "===== Start working with fold 3 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n", + "\n", + "Linear model: C = 1e-05 score = [-3.6205108]\n", + "Linear model: C = 5e-05 score = [-3.6205108]\n", + "Linear model: C = 0.0001 score = [-3.6205108]\n", + "\n", + "===== Start working with fold 4 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n", + "\n", + "Linear model: C = 1e-05 score = [-3.2158222]\n", + "Linear model: C = 5e-05 score = [-3.2158222]\n", + "Linear model: C = 0.0001 score = [-3.2158222]\n", + "Lvl_0_Pipe_0_Mod_0_LinearL2 fitting and predicting completed\n", + "Time left 86.83387517929077\n", + "Start fitting Lvl_0_Pipe_1_Mod_0_LightGBM ...\n", + "\n", + "===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_0_LightGBM =====\n", + "\n", + "Training until validation scores don't improve for 200 rounds\n", + "[100]\tvalid's l2: 0.175464\tvalid's Opt metric: 2.38008\n", + "[200]\tvalid's l2: 0.137539\tvalid's Opt metric: 2.00419\n", + "[300]\tvalid's l2: 0.126562\tvalid's Opt metric: 1.85852\n", + "[400]\tvalid's l2: 0.122176\tvalid's Opt metric: 1.77832\n", + "[500]\tvalid's l2: 0.120404\tvalid's Opt metric: 1.74085\n", + "[600]\tvalid's l2: 
0.119403\tvalid's Opt metric: 1.71643\n", + "[700]\tvalid's l2: 0.118939\tvalid's Opt metric: 1.7027\n", + "[800]\tvalid's l2: 0.1189\tvalid's Opt metric: 1.69712\n", + "[900]\tvalid's l2: 0.118543\tvalid's Opt metric: 1.68706\n", + "[1000]\tvalid's l2: 0.11832\tvalid's Opt metric: 1.68094\n", + "[1100]\tvalid's l2: 0.118228\tvalid's Opt metric: 1.67885\n", + "[1200]\tvalid's l2: 0.118142\tvalid's Opt metric: 1.67488\n", + "[1300]\tvalid's l2: 0.117963\tvalid's Opt metric: 1.66822\n", + "[1400]\tvalid's l2: 0.117946\tvalid's Opt metric: 1.66472\n", + "[1500]\tvalid's l2: 0.118028\tvalid's Opt metric: 1.66498\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Time limit exceeded after calculating fold 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Early stopping, best iteration is:\n", + "[1336]\tvalid's l2: 0.117866\tvalid's Opt metric: 1.6654\n", + "Lvl_0_Pipe_1_Mod_0_LightGBM fitting and predicting completed\n", + "Start fitting Lvl_0_Pipe_1_Mod_2_CatBoost ...\n", + "\n", + "===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_2_CatBoost =====\n", + "\n", + "0:\tlearn: 0.5690497\ttest: 0.5902307\tbest: 0.5902307 (0)\ttotal: 9.89ms\tremaining: 19.8s\n", + "100:\tlearn: 0.3220134\ttest: 0.3603304\tbest: 0.3603304 (100)\ttotal: 241ms\tremaining: 4.53s\n", + "200:\tlearn: 0.2853510\ttest: 0.3517094\tbest: 0.3517094 (200)\ttotal: 498ms\tremaining: 4.46s\n", + "300:\tlearn: 0.2576051\ttest: 0.3490464\tbest: 0.3490393 (299)\ttotal: 737ms\tremaining: 4.16s\n", + "400:\tlearn: 0.2379688\ttest: 0.3474727\tbest: 0.3474727 (400)\ttotal: 1s\tremaining: 4s\n", + "500:\tlearn: 0.2222526\ttest: 0.3449390\tbest: 0.3449390 (500)\ttotal: 1.23s\tremaining: 3.67s\n", + "600:\tlearn: 0.2078858\ttest: 0.3447732\tbest: 0.3443742 (549)\ttotal: 1.49s\tremaining: 3.46s\n", + "700:\tlearn: 0.1960459\ttest: 0.3442549\tbest: 0.3441099 (652)\ttotal: 1.75s\tremaining: 3.24s\n", + "800:\tlearn: 0.1857116\ttest: 0.3438032\tbest: 
0.3436641 (740)\ttotal: 2s\tremaining: 2.99s\n", + "900:\tlearn: 0.1759209\ttest: 0.3433300\tbest: 0.3432126 (844)\ttotal: 2.22s\tremaining: 2.71s\n", + "1000:\tlearn: 0.1668783\ttest: 0.3428658\tbest: 0.3428565 (990)\ttotal: 2.47s\tremaining: 2.46s\n", + "1100:\tlearn: 0.1586743\ttest: 0.3419130\tbest: 0.3418887 (1090)\ttotal: 2.69s\tremaining: 2.2s\n", + "1200:\tlearn: 0.1510258\ttest: 0.3414564\tbest: 0.3414564 (1200)\ttotal: 2.94s\tremaining: 1.96s\n", + "1300:\tlearn: 0.1441223\ttest: 0.3414049\tbest: 0.3409616 (1261)\ttotal: 3.18s\tremaining: 1.71s\n", + "1400:\tlearn: 0.1377195\ttest: 0.3414637\tbest: 0.3409616 (1261)\ttotal: 3.44s\tremaining: 1.47s\n", + "1500:\tlearn: 0.1316497\ttest: 0.3414028\tbest: 0.3409616 (1261)\ttotal: 3.67s\tremaining: 1.22s\n", + "Stopped by overfitting detector (300 iterations wait)\n", + "\n", + "bestTest = 0.3409616327\n", + "bestIteration = 1261\n", + "\n", + "Shrink model to first 1262 iterations.\n", + "\n", + "===== Start working with fold 1 for Lvl_0_Pipe_1_Mod_2_CatBoost =====\n", + "\n", + "0:\tlearn: 0.5768234\ttest: 0.5574083\tbest: 0.5574083 (0)\ttotal: 3.21ms\tremaining: 6.42s\n", + "100:\tlearn: 0.3247511\ttest: 0.3336189\tbest: 0.3336189 (100)\ttotal: 246ms\tremaining: 4.62s\n", + "200:\tlearn: 0.2892813\ttest: 0.3252905\tbest: 0.3252353 (193)\ttotal: 536ms\tremaining: 4.79s\n", + "300:\tlearn: 0.2614113\ttest: 0.3229165\tbest: 0.3227958 (299)\ttotal: 767ms\tremaining: 4.33s\n", + "400:\tlearn: 0.2413630\ttest: 0.3221904\tbest: 0.3219651 (324)\ttotal: 1.02s\tremaining: 4.08s\n", + "500:\tlearn: 0.2248640\ttest: 0.3214037\tbest: 0.3211377 (457)\ttotal: 1.27s\tremaining: 3.81s\n", + "600:\tlearn: 0.2108310\ttest: 0.3208134\tbest: 0.3207517 (597)\ttotal: 1.56s\tremaining: 3.62s\n", + "700:\tlearn: 0.1988404\ttest: 0.3209918\tbest: 0.3207120 (654)\ttotal: 1.8s\tremaining: 3.33s\n", + "800:\tlearn: 0.1880008\ttest: 0.3214166\tbest: 0.3207120 (654)\ttotal: 2.05s\tremaining: 3.06s\n", + "900:\tlearn: 0.1778877\ttest: 
0.3214287\tbest: 0.3207120 (654)\ttotal: 2.35s\tremaining: 2.86s\n", + "Stopped by overfitting detector (300 iterations wait)\n", + "\n", + "bestTest = 0.320711959\n", + "bestIteration = 654\n", + "\n", + "Shrink model to first 655 iterations.\n", + "\n", + "===== Start working with fold 2 for Lvl_0_Pipe_1_Mod_2_CatBoost =====\n", + "\n", + "0:\tlearn: 0.5724626\ttest: 0.5753481\tbest: 0.5753481 (0)\ttotal: 3.17ms\tremaining: 6.33s\n", + "100:\tlearn: 0.3239383\ttest: 0.3399819\tbest: 0.3399819 (100)\ttotal: 266ms\tremaining: 5s\n", + "200:\tlearn: 0.2867294\ttest: 0.3285307\tbest: 0.3285307 (200)\ttotal: 512ms\tremaining: 4.58s\n", + "300:\tlearn: 0.2589989\ttest: 0.3255054\tbest: 0.3254546 (298)\ttotal: 750ms\tremaining: 4.23s\n", + "400:\tlearn: 0.2388187\ttest: 0.3247287\tbest: 0.3245618 (398)\ttotal: 976ms\tremaining: 3.89s\n", + "500:\tlearn: 0.2222920\ttest: 0.3240440\tbest: 0.3240057 (499)\ttotal: 1.24s\tremaining: 3.7s\n", + "600:\tlearn: 0.2086809\ttest: 0.3226891\tbest: 0.3226891 (600)\ttotal: 1.49s\tremaining: 3.46s\n", + "700:\tlearn: 0.1960510\ttest: 0.3210332\tbest: 0.3210024 (698)\ttotal: 1.74s\tremaining: 3.22s\n", + "800:\tlearn: 0.1854055\ttest: 0.3204972\tbest: 0.3204972 (800)\ttotal: 2s\tremaining: 2.99s\n", + "900:\tlearn: 0.1764789\ttest: 0.3205735\tbest: 0.3200469 (851)\ttotal: 2.28s\tremaining: 2.78s\n", + "1000:\tlearn: 0.1678033\ttest: 0.3209024\tbest: 0.3200469 (851)\ttotal: 2.52s\tremaining: 2.52s\n", + "1100:\tlearn: 0.1599763\ttest: 0.3211878\tbest: 0.3200469 (851)\ttotal: 2.79s\tremaining: 2.28s\n", + "Stopped by overfitting detector (300 iterations wait)\n", + "\n", + "bestTest = 0.3200468666\n", + "bestIteration = 851\n", + "\n", + "Shrink model to first 852 iterations.\n", + "\n", + "===== Start working with fold 3 for Lvl_0_Pipe_1_Mod_2_CatBoost =====\n", + "\n", + "0:\tlearn: 0.5652387\ttest: 0.6041895\tbest: 0.6041895 (0)\ttotal: 3.64ms\tremaining: 7.28s\n", + "100:\tlearn: 0.3086002\ttest: 0.3976876\tbest: 0.3976876 
(100)\ttotal: 264ms\tremaining: 4.97s\n", + "200:\tlearn: 0.2756635\ttest: 0.3880626\tbest: 0.3880626 (200)\ttotal: 501ms\tremaining: 4.49s\n", + "300:\tlearn: 0.2511624\ttest: 0.3852988\tbest: 0.3849651 (283)\ttotal: 754ms\tremaining: 4.25s\n", + "400:\tlearn: 0.2321052\ttest: 0.3859294\tbest: 0.3849651 (283)\ttotal: 997ms\tremaining: 3.97s\n", + "500:\tlearn: 0.2175329\ttest: 0.3865094\tbest: 0.3849651 (283)\ttotal: 1.26s\tremaining: 3.77s\n", + "Stopped by overfitting detector (300 iterations wait)\n", + "\n", + "bestTest = 0.3849650533\n", + "bestIteration = 283\n", + "\n", + "Shrink model to first 284 iterations.\n", + "\n", + "===== Start working with fold 4 for Lvl_0_Pipe_1_Mod_2_CatBoost =====\n", + "\n", + "0:\tlearn: 0.5819025\ttest: 0.5416817\tbest: 0.5416817 (0)\ttotal: 16.8ms\tremaining: 33.5s\n", + "100:\tlearn: 0.3282525\ttest: 0.3302178\tbest: 0.3302178 (100)\ttotal: 292ms\tremaining: 5.5s\n", + "200:\tlearn: 0.2935099\ttest: 0.3155985\tbest: 0.3154939 (198)\ttotal: 559ms\tremaining: 5s\n", + "300:\tlearn: 0.2640830\ttest: 0.3098426\tbest: 0.3098426 (300)\ttotal: 817ms\tremaining: 4.61s\n", + "400:\tlearn: 0.2425758\ttest: 0.3053681\tbest: 0.3053681 (400)\ttotal: 1.08s\tremaining: 4.31s\n", + "500:\tlearn: 0.2263103\ttest: 0.3027193\tbest: 0.3024671 (488)\ttotal: 1.35s\tremaining: 4.05s\n", + "600:\tlearn: 0.2119579\ttest: 0.3012696\tbest: 0.3011144 (582)\ttotal: 1.62s\tremaining: 3.78s\n", + "700:\tlearn: 0.1987671\ttest: 0.2997913\tbest: 0.2997913 (700)\ttotal: 1.89s\tremaining: 3.5s\n", + "800:\tlearn: 0.1877085\ttest: 0.2993222\tbest: 0.2991978 (796)\ttotal: 2.15s\tremaining: 3.21s\n", + "900:\tlearn: 0.1779556\ttest: 0.2988217\tbest: 0.2988217 (900)\ttotal: 2.41s\tremaining: 2.94s\n", + "1000:\tlearn: 0.1691039\ttest: 0.2991610\tbest: 0.2988179 (904)\ttotal: 2.67s\tremaining: 2.66s\n", + "1100:\tlearn: 0.1609237\ttest: 0.2994798\tbest: 0.2988179 (904)\ttotal: 2.92s\tremaining: 2.38s\n", + "1200:\tlearn: 0.1537691\ttest: 0.2996605\tbest: 
0.2988179 (904)\ttotal: 3.17s\tremaining: 2.11s\n", + "Stopped by overfitting detector (300 iterations wait)\n", + "\n", + "bestTest = 0.2988178835\n", + "bestIteration = 904\n", + "\n", + "Shrink model to first 905 iterations.\n", + "Lvl_0_Pipe_1_Mod_2_CatBoost fitting and predicting completed\n", + "Optuna may run 1 secs\n", + "Start fitting Lvl_0_Pipe_1_Mod_3_CatBoost ...\n", + "\n", + "===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_3_CatBoost =====\n", + "\n", + "0:\tlearn: 0.5696811\ttest: 0.5907190\tbest: 0.5907190 (0)\ttotal: 4.77ms\tremaining: 9.53s\n", + "100:\tlearn: 0.3328412\ttest: 0.3613108\tbest: 0.3613108 (100)\ttotal: 196ms\tremaining: 3.68s\n", + "200:\tlearn: 0.3027316\ttest: 0.3519878\tbest: 0.3519878 (200)\ttotal: 368ms\tremaining: 3.29s\n", + "300:\tlearn: 0.2794207\ttest: 0.3487270\tbest: 0.3487270 (300)\ttotal: 582ms\tremaining: 3.29s\n", + "400:\tlearn: 0.2634165\ttest: 0.3479933\tbest: 0.3479933 (400)\ttotal: 775ms\tremaining: 3.09s\n", + "500:\tlearn: 0.2504892\ttest: 0.3473465\tbest: 0.3470908 (452)\ttotal: 961ms\tremaining: 2.88s\n", + "600:\tlearn: 0.2399381\ttest: 0.3461447\tbest: 0.3458834 (593)\ttotal: 1.17s\tremaining: 2.72s\n", + "700:\tlearn: 0.2293243\ttest: 0.3446773\tbest: 0.3445460 (694)\ttotal: 1.35s\tremaining: 2.5s\n", + "800:\tlearn: 0.2201892\ttest: 0.3445020\tbest: 0.3441724 (728)\ttotal: 1.56s\tremaining: 2.34s\n", + "900:\tlearn: 0.2122752\ttest: 0.3439144\tbest: 0.3438057 (883)\ttotal: 1.76s\tremaining: 2.15s\n", + "1000:\tlearn: 0.2050941\ttest: 0.3442635\tbest: 0.3438057 (883)\ttotal: 1.95s\tremaining: 1.95s\n", + "1100:\tlearn: 0.1984862\ttest: 0.3437866\tbest: 0.3437749 (1051)\ttotal: 2.14s\tremaining: 1.75s\n", + "1200:\tlearn: 0.1924283\ttest: 0.3433972\tbest: 0.3433483 (1198)\ttotal: 2.36s\tremaining: 1.57s\n", + "1300:\tlearn: 0.1863886\ttest: 0.3431625\tbest: 0.3430420 (1288)\ttotal: 2.56s\tremaining: 1.38s\n", + "1400:\tlearn: 0.1807727\ttest: 0.3427477\tbest: 0.3427039 (1394)\ttotal: 2.74s\tremaining: 
1.17s\n", + "1500:\tlearn: 0.1756772\ttest: 0.3423999\tbest: 0.3423999 (1500)\ttotal: 2.95s\tremaining: 979ms\n", + "1600:\tlearn: 0.1709954\ttest: 0.3426386\tbest: 0.3421255 (1541)\ttotal: 3.12s\tremaining: 779ms\n", + "1700:\tlearn: 0.1662324\ttest: 0.3423893\tbest: 0.3421255 (1541)\ttotal: 3.35s\tremaining: 589ms\n", + "1800:\tlearn: 0.1620149\ttest: 0.3419531\tbest: 0.3419456 (1796)\ttotal: 3.52s\tremaining: 390ms\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1900:\tlearn: 0.1579787\ttest: 0.3420298\tbest: 0.3418931 (1815)\ttotal: 3.71s\tremaining: 193ms\n", + "1999:\tlearn: 0.1538067\ttest: 0.3423697\tbest: 0.3418931 (1815)\ttotal: 3.89s\tremaining: 0us\n", + "\n", + "bestTest = 0.3418931473\n", + "bestIteration = 1815\n", + "\n", + "Shrink model to first 1816 iterations.\n", + "Lvl_0_Pipe_1_Mod_3_CatBoost fitting and predicting completed\n", + "Start fitting Lvl_0_Pipe_1_Mod_3_CatBoost ...\n", + "\n", + "===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_3_CatBoost =====\n", + "\n", + "0:\tlearn: 0.5749238\ttest: 0.5962756\tbest: 0.5962756 (0)\ttotal: 3.23ms\tremaining: 9.69s\n", + "100:\tlearn: 0.3538624\ttest: 0.3750769\tbest: 0.3750769 (100)\ttotal: 205ms\tremaining: 5.9s\n", + "200:\tlearn: 0.3247688\ttest: 0.3579003\tbest: 0.3579003 (200)\ttotal: 397ms\tremaining: 5.53s\n", + "300:\tlearn: 0.3067646\ttest: 0.3529264\tbest: 0.3529264 (300)\ttotal: 596ms\tremaining: 5.34s\n", + "400:\tlearn: 0.2906869\ttest: 0.3494385\tbest: 0.3494385 (400)\ttotal: 773ms\tremaining: 5.01s\n", + "500:\tlearn: 0.2787513\ttest: 0.3480935\tbest: 0.3480935 (500)\ttotal: 940ms\tremaining: 4.69s\n", + "600:\tlearn: 0.2682686\ttest: 0.3470561\tbest: 0.3470561 (600)\ttotal: 1.14s\tremaining: 4.56s\n", + "700:\tlearn: 0.2590796\ttest: 0.3463205\tbest: 0.3463073 (694)\ttotal: 1.33s\tremaining: 4.36s\n", + "800:\tlearn: 0.2514009\ttest: 0.3451208\tbest: 0.3451208 (800)\ttotal: 1.51s\tremaining: 4.16s\n", + "900:\tlearn: 0.2444905\ttest: 
0.3440930\tbest: 0.3440841 (895)\ttotal: 1.71s\tremaining: 3.98s\n", + "1000:\tlearn: 0.2378982\ttest: 0.3435150\tbest: 0.3434602 (992)\ttotal: 1.9s\tremaining: 3.79s\n", + "1100:\tlearn: 0.2322087\ttest: 0.3431667\tbest: 0.3431153 (1075)\ttotal: 2.1s\tremaining: 3.63s\n", + "1200:\tlearn: 0.2265210\ttest: 0.3425909\tbest: 0.3425467 (1188)\ttotal: 2.29s\tremaining: 3.42s\n", + "1300:\tlearn: 0.2213036\ttest: 0.3420838\tbest: 0.3420157 (1288)\ttotal: 2.46s\tremaining: 3.21s\n", + "1400:\tlearn: 0.2163428\ttest: 0.3418436\tbest: 0.3417805 (1394)\ttotal: 2.67s\tremaining: 3.05s\n", + "1500:\tlearn: 0.2115906\ttest: 0.3414778\tbest: 0.3414778 (1500)\ttotal: 2.88s\tremaining: 2.87s\n", + "1600:\tlearn: 0.2070074\ttest: 0.3410625\tbest: 0.3410625 (1600)\ttotal: 3.09s\tremaining: 2.7s\n", + "1700:\tlearn: 0.2026507\ttest: 0.3405289\tbest: 0.3405001 (1679)\ttotal: 3.27s\tremaining: 2.5s\n", + "1800:\tlearn: 0.1988309\ttest: 0.3401400\tbest: 0.3401251 (1798)\ttotal: 3.48s\tremaining: 2.31s\n", + "1900:\tlearn: 0.1952427\ttest: 0.3398704\tbest: 0.3396758 (1880)\ttotal: 3.68s\tremaining: 2.13s\n", + "2000:\tlearn: 0.1915851\ttest: 0.3396242\tbest: 0.3396191 (1999)\ttotal: 3.88s\tremaining: 1.94s\n", + "2100:\tlearn: 0.1882232\ttest: 0.3395831\tbest: 0.3395527 (2074)\ttotal: 4.07s\tremaining: 1.74s\n", + "2200:\tlearn: 0.1848821\ttest: 0.3393809\tbest: 0.3393421 (2165)\ttotal: 4.31s\tremaining: 1.56s\n", + "2300:\tlearn: 0.1817400\ttest: 0.3391877\tbest: 0.3391877 (2300)\ttotal: 4.5s\tremaining: 1.37s\n", + "2400:\tlearn: 0.1787974\ttest: 0.3391625\tbest: 0.3391625 (2400)\ttotal: 4.69s\tremaining: 1.17s\n", + "2500:\tlearn: 0.1756702\ttest: 0.3388642\tbest: 0.3388608 (2499)\ttotal: 4.88s\tremaining: 973ms\n", + "2600:\tlearn: 0.1728922\ttest: 0.3389253\tbest: 0.3388231 (2508)\ttotal: 5.09s\tremaining: 781ms\n", + "Stopped by overfitting detector (100 iterations wait)\n", + "\n", + "bestTest = 0.3388231235\n", + "bestIteration = 2508\n", + "\n", + "Shrink model to first 2509 
iterations.\n", + "\n", + "===== Start working with fold 1 for Lvl_0_Pipe_1_Mod_3_CatBoost =====\n", + "\n", + "0:\tlearn: 0.5830078\ttest: 0.5633221\tbest: 0.5633221 (0)\ttotal: 8.6ms\tremaining: 25.8s\n", + "100:\tlearn: 0.3589969\ttest: 0.3533597\tbest: 0.3533597 (100)\ttotal: 196ms\tremaining: 5.61s\n", + "200:\tlearn: 0.3307810\ttest: 0.3362463\tbest: 0.3362463 (200)\ttotal: 377ms\tremaining: 5.25s\n", + "300:\tlearn: 0.3120042\ttest: 0.3301034\tbest: 0.3301034 (300)\ttotal: 551ms\tremaining: 4.94s\n", + "400:\tlearn: 0.2950774\ttest: 0.3273159\tbest: 0.3273074 (398)\ttotal: 732ms\tremaining: 4.74s\n", + "500:\tlearn: 0.2822802\ttest: 0.3260960\tbest: 0.3260960 (500)\ttotal: 908ms\tremaining: 4.53s\n", + "600:\tlearn: 0.2718319\ttest: 0.3244773\tbest: 0.3244541 (597)\ttotal: 1.11s\tremaining: 4.42s\n", + "700:\tlearn: 0.2631367\ttest: 0.3236967\tbest: 0.3236967 (700)\ttotal: 1.29s\tremaining: 4.23s\n", + "800:\tlearn: 0.2555498\ttest: 0.3241221\tbest: 0.3236444 (718)\ttotal: 1.48s\tremaining: 4.05s\n", + "Stopped by overfitting detector (100 iterations wait)\n", + "\n", + "bestTest = 0.3236443836\n", + "bestIteration = 718\n", + "\n", + "Shrink model to first 719 iterations.\n", + "\n", + "===== Start working with fold 2 for Lvl_0_Pipe_1_Mod_3_CatBoost =====\n", + "\n", + "0:\tlearn: 0.5784948\ttest: 0.5817224\tbest: 0.5817224 (0)\ttotal: 15.4ms\tremaining: 46.3s\n", + "100:\tlearn: 0.3581544\ttest: 0.3571058\tbest: 0.3571058 (100)\ttotal: 211ms\tremaining: 6.05s\n", + "200:\tlearn: 0.3282811\ttest: 0.3383338\tbest: 0.3383338 (200)\ttotal: 395ms\tremaining: 5.5s\n", + "300:\tlearn: 0.3097713\ttest: 0.3322135\tbest: 0.3320776 (297)\ttotal: 573ms\tremaining: 5.14s\n", + "400:\tlearn: 0.2938576\ttest: 0.3299511\tbest: 0.3299511 (400)\ttotal: 769ms\tremaining: 4.98s\n", + "500:\tlearn: 0.2805774\ttest: 0.3287883\tbest: 0.3287883 (500)\ttotal: 946ms\tremaining: 4.72s\n", + "600:\tlearn: 0.2703739\ttest: 0.3269768\tbest: 0.3269716 (599)\ttotal: 1.13s\tremaining: 
4.5s\n", + "700:\tlearn: 0.2616548\ttest: 0.3258652\tbest: 0.3257247 (696)\ttotal: 1.31s\tremaining: 4.29s\n", + "800:\tlearn: 0.2540794\ttest: 0.3248115\tbest: 0.3246713 (780)\ttotal: 1.49s\tremaining: 4.09s\n", + "900:\tlearn: 0.2470055\ttest: 0.3240086\tbest: 0.3239652 (896)\ttotal: 1.68s\tremaining: 3.9s\n", + "1000:\tlearn: 0.2407159\ttest: 0.3232904\tbest: 0.3232768 (993)\ttotal: 1.87s\tremaining: 3.74s\n", + "1100:\tlearn: 0.2349685\ttest: 0.3226389\tbest: 0.3224739 (1072)\ttotal: 2.08s\tremaining: 3.58s\n", + "1200:\tlearn: 0.2291905\ttest: 0.3219706\tbest: 0.3219349 (1195)\ttotal: 2.28s\tremaining: 3.42s\n", + "1300:\tlearn: 0.2242843\ttest: 0.3216330\tbest: 0.3216282 (1299)\ttotal: 2.48s\tremaining: 3.24s\n", + "1400:\tlearn: 0.2193816\ttest: 0.3213742\tbest: 0.3211652 (1345)\ttotal: 2.71s\tremaining: 3.09s\n", + "1500:\tlearn: 0.2150231\ttest: 0.3210364\tbest: 0.3209320 (1480)\ttotal: 2.9s\tremaining: 2.9s\n", + "1600:\tlearn: 0.2108812\ttest: 0.3206202\tbest: 0.3206202 (1600)\ttotal: 3.08s\tremaining: 2.69s\n", + "1700:\tlearn: 0.2066767\ttest: 0.3199278\tbest: 0.3199118 (1699)\ttotal: 3.27s\tremaining: 2.5s\n", + "1800:\tlearn: 0.2027083\ttest: 0.3198993\tbest: 0.3198815 (1726)\ttotal: 3.48s\tremaining: 2.31s\n", + "Stopped by overfitting detector (100 iterations wait)\n", + "\n", + "bestTest = 0.3198814937\n", + "bestIteration = 1726\n", + "\n", + "Shrink model to first 1727 iterations.\n", + "\n", + "===== Start working with fold 3 for Lvl_0_Pipe_1_Mod_3_CatBoost =====\n", + "\n", + "0:\tlearn: 0.5710290\ttest: 0.6096694\tbest: 0.6096694 (0)\ttotal: 2.86ms\tremaining: 8.58s\n", + "100:\tlearn: 0.3444958\ttest: 0.4152803\tbest: 0.4152803 (100)\ttotal: 224ms\tremaining: 6.42s\n", + "200:\tlearn: 0.3159040\ttest: 0.3996337\tbest: 0.3996337 (200)\ttotal: 420ms\tremaining: 5.85s\n", + "300:\tlearn: 0.2999551\ttest: 0.3924528\tbest: 0.3924528 (300)\ttotal: 601ms\tremaining: 5.38s\n", + "400:\tlearn: 0.2851919\ttest: 0.3881614\tbest: 0.3881614 (400)\ttotal: 
768ms\tremaining: 4.98s\n", + "500:\tlearn: 0.2728422\ttest: 0.3868374\tbest: 0.3867807 (490)\ttotal: 936ms\tremaining: 4.67s\n", + "600:\tlearn: 0.2631856\ttest: 0.3861455\tbest: 0.3861455 (600)\ttotal: 1.1s\tremaining: 4.4s\n", + "700:\tlearn: 0.2550000\ttest: 0.3857666\tbest: 0.3856754 (668)\ttotal: 1.29s\tremaining: 4.25s\n", + "800:\tlearn: 0.2474017\ttest: 0.3856947\tbest: 0.3856013 (792)\ttotal: 1.49s\tremaining: 4.09s\n", + "900:\tlearn: 0.2405549\ttest: 0.3855558\tbest: 0.3855096 (888)\ttotal: 1.68s\tremaining: 3.9s\n", + "Stopped by overfitting detector (100 iterations wait)\n", + "\n", + "bestTest = 0.3855095628\n", + "bestIteration = 888\n", + "\n", + "Shrink model to first 889 iterations.\n", + "\n", + "===== Start working with fold 4 for Lvl_0_Pipe_1_Mod_3_CatBoost =====\n", + "\n", + "0:\tlearn: 0.5877937\ttest: 0.5433457\tbest: 0.5433457 (0)\ttotal: 2.46ms\tremaining: 7.39s\n", + "100:\tlearn: 0.3635841\ttest: 0.3502432\tbest: 0.3502432 (100)\ttotal: 205ms\tremaining: 5.9s\n", + "200:\tlearn: 0.3350821\ttest: 0.3277620\tbest: 0.3277620 (200)\ttotal: 394ms\tremaining: 5.48s\n", + "300:\tlearn: 0.3163325\ttest: 0.3186592\tbest: 0.3186268 (298)\ttotal: 581ms\tremaining: 5.21s\n", + "400:\tlearn: 0.2991828\ttest: 0.3114547\tbest: 0.3114547 (400)\ttotal: 765ms\tremaining: 4.96s\n", + "500:\tlearn: 0.2853312\ttest: 0.3067744\tbest: 0.3067744 (500)\ttotal: 961ms\tremaining: 4.79s\n", + "600:\tlearn: 0.2746053\ttest: 0.3043128\tbest: 0.3043128 (600)\ttotal: 1.19s\tremaining: 4.75s\n", + "700:\tlearn: 0.2651369\ttest: 0.3024191\tbest: 0.3024121 (699)\ttotal: 1.4s\tremaining: 4.6s\n", + "800:\tlearn: 0.2573767\ttest: 0.3007672\tbest: 0.3007672 (800)\ttotal: 1.61s\tremaining: 4.43s\n", + "900:\tlearn: 0.2503800\ttest: 0.2996296\tbest: 0.2996044 (890)\ttotal: 1.85s\tremaining: 4.3s\n", + "1000:\tlearn: 0.2433897\ttest: 0.2990408\tbest: 0.2990408 (1000)\ttotal: 2.05s\tremaining: 4.1s\n", + "1100:\tlearn: 0.2373764\ttest: 0.2977549\tbest: 0.2977112 (1099)\ttotal: 
2.24s\tremaining: 3.87s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1200:\tlearn: 0.2311849\ttest: 0.2972003\tbest: 0.2971525 (1196)\ttotal: 2.42s\tremaining: 3.63s\n", + "1300:\tlearn: 0.2256356\ttest: 0.2963918\tbest: 0.2963918 (1300)\ttotal: 2.62s\tremaining: 3.42s\n", + "1400:\tlearn: 0.2204745\ttest: 0.2958023\tbest: 0.2958023 (1400)\ttotal: 2.81s\tremaining: 3.2s\n", + "1500:\tlearn: 0.2158370\ttest: 0.2953656\tbest: 0.2953011 (1496)\ttotal: 2.98s\tremaining: 2.97s\n", + "1600:\tlearn: 0.2115284\ttest: 0.2947484\tbest: 0.2947203 (1567)\ttotal: 3.14s\tremaining: 2.75s\n", + "1700:\tlearn: 0.2073122\ttest: 0.2944945\tbest: 0.2944290 (1697)\ttotal: 3.37s\tremaining: 2.57s\n", + "Stopped by overfitting detector (100 iterations wait)\n", + "\n", + "bestTest = 0.2944289789\n", + "bestIteration = 1697\n", + "\n", + "Shrink model to first 1698 iterations.\n", + "Lvl_0_Pipe_1_Mod_3_CatBoost fitting and predicting completed\n", + "Time left 37.47469425201416\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Time limit exceeded in one of the tasks. AutoML will blend level 1 models.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Blending: Optimization starts with equal weights and score -1.7954214419609544\n", + "Blending, iter 0: score = -1.4724501895733335, weights = [0. 0.86662203 0.05572809 0.07764989]\n", + "Blending, iter 1: score = -1.4661120519816215, weights = [0. 0.7553367 0.10243508 0.14222825]\n", + "Blending, iter 2: score = -1.4661120519816215, weights = [0. 0.7553367 0.10243508 0.14222825]\n", + "No score update. 
Terminated\n", + "\n", + "Automl preset training completed in 66.05 seconds.\n" + ] + } + ], + "source": [ + "automl = TabularUtilizedAutoML(task = task,\n", + " timeout = TIMEOUT,\n", + " cpu_limit = N_THREADS,\n", + " reader_params = {'n_jobs': N_THREADS,\n", + " 'cv': N_FOLDS,\n", + " 'random_state': RANDOM_STATE})\n", + "\n", + "oof_pred = automl.fit_predict(train_data, roles = roles)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "#deviation_metric(np.array(train_df['per_square_meter_price']), oof_pred.data[:, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "output['per_square_meter_price'] = output['per_square_meter_price'] * 0.9\n", + "\n", + "output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] \\\n", + " = output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] * 0.9" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "id": "e0d6dfe7" + }, + "outputs": [], + "source": [ + "output = pd.DataFrame({'id': test_data['id'],\n", + " 'per_square_meter_price': automl.predict(test_data).data[:, 0]})\n", + "output['per_square_meter_price'] = output['per_square_meter_price'] * 0.9\n", + "\n", + "output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] \\\n", + " = output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] * 0.9\n", + "\n", + "output.to_csv('raifHack_ki7.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2974, 2)" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + 
"collapsed_sections": [], + "name": "housePrice_AutoML.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Yapiki_publ/Read me.txt b/Yapiki_publ/Read me.txt new file mode 100644 index 0000000..4f3107e --- /dev/null +++ b/Yapiki_publ/Read me.txt @@ -0,0 +1,39 @@ +Решение основано на алгоритме LightAutoML с дополнительным обогащением данных из открытых источников (Росстат). + + +- Проведен EDA инструментами pandas_profile и сравнениями корреляции метрик разными методами в зависимости от фичей - +Pandas profiling - https://drive.google.com/file/d/1xQl3LvpX9J0G6gJoaBjzRcBFKZi6QZXz/view?usp=sharing +- Введены дополнительные метрики: премиальность этажей и типирование города +- Использованы дополнительные стат данные (среднемесячная заработная плата и численность населения по регионам, Источник: Росстат); +- Учтен дисконт, закладываемый при ручном расчете (использовали вариант из публичного решения, ссылка на github: https://github.com/BatyaZhizni/Raifhack-DS) + + +Для обогащения данных использованы дополнительные датасеты: + + +1. zarplaty.xlsx - Среднемесячная номинальная начисленная заработная плата работников в целом по экономике по субъектам Российской Федерации за 2000-2020 гг. 
Источник: Росстат, https://rosstat.gov.ru/labor_market_employment_salaries Ссылка - https://docs.google.com/spreadsheets/d/1S1ORmz2W4QTG-d8odUOqT6Czu21NF2Vw/edit?usp=sharing&ouid=108685579276627434305&rtpof=true&sd=true +2. city_population.rar - Численность населения по населенным пунктам России. Источник: Росстат, https://rosstat.gov.ru/folder/12781. Ссылка: https://drive.google.com/file/d/19hJI_zlTZboxSh_JwPrWt8vYx9lkNlM0/view?usp=sharing + + +Для обучения использовали модель LightAutoML* + + +1. LightAutoML project from Sberbank AI Lab AutoML group is the framework for automatic classification and regression model creation. + + +Authors: Alexander Ryzhkov, Anton Vakhrushev, Dmitry Simakov, Vasilii Bunakov, Rinchin Damdinov, Pavel Shvets, Alexander Kirilin + + +2. Библиотеки +!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip +import pandas as pd +import numpy as np +import typing +import torch +import matplotlib.pyplot as plt + + +!pip install -U lightautoml +from sklearn.model_selection import train_test_split +from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML +from lightautoml.tasks import Task \ No newline at end of file diff --git a/Yapiki_publ/city_population.rar b/Yapiki_publ/city_population.rar new file mode 100644 index 0000000..4cad566 Binary files /dev/null and b/Yapiki_publ/city_population.rar differ diff --git a/Yapiki_publ/zarplaty.xlsx b/Yapiki_publ/zarplaty.xlsx new file mode 100644 index 0000000..296673c Binary files /dev/null and b/Yapiki_publ/zarplaty.xlsx differ