| Server IP : 68.183.124.220 / Your IP : 216.73.217.137 Web Server : Apache/2.4.18 (Ubuntu) System : Linux Sandbox-A 4.4.0-210-generic #242-Ubuntu SMP Fri Apr 16 09:57:56 UTC 2021 x86_64 User : gavin ( 1000) PHP Version : 7.0.33-0ubuntu0.16.04.16 Disable Function : pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority, MySQL : OFF | cURL : ON | WGET : ON | Perl : ON | Python : ON | Sudo : ON | Pkexec : ON Directory : /home/gavin/workspace/comecondo/python/price_estimator/ |
Upload File : |
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 16 13:37:51 2021
@author: Gavin
"""
from sklearn.tree import DecisionTreeRegressor
from bs4 import BeautifulSoup
import mysql.connector
import os
import pandas as pd
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# base_URL = 'https://www.zolo.ca/toronto-real-estate/'
base_URL = 'https://www.zolo.ca/'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}
cities = ['toronto', 'oakville', 'markham', 'hamilton', 'mississauga', 'newmarket', 'burlington', 'kitchener', 'vaughan']
df_all = pd.DataFrame(columns=['City', 'Latitude', 'Longitude', 'Beds', 'Baths', 'Size', 'Age', 'Price'])

# Year used to turn a "Built in YYYY" label into an age in years.
REFERENCE_YEAR = 2021

# CSS class strings that identify the two parallel element lists on a page.
LISTING_VALUES_CLASS = 'card-listing--values truncate list-unstyled xs-flex-order-1 xs-mb05'
LISTING_LOCATION_CLASS = 'card-listing--location text-5 truncate xs-flex-order-2'

# os.chdir('C:\\Users\Gavin\workspace\ComeCondo\Python')
# 'data/...' and 'summary.csv' below are relative paths, so show where they resolve.
print(os.getcwd())


# ---------------------------------------------------------------------------
# Parsing helpers (plain string / duck-typed, no BeautifulSoup types required)
# ---------------------------------------------------------------------------
def tag_text(tag, default='NA'):
    """Return the stripped ``.string`` of *tag*, or *default* if there is none.

    *tag* may be ``None`` (element not found) and ``tag.string`` may be
    ``None`` (element has no single text child); both yield *default*.
    """
    text = None if tag is None else tag.string
    return default if text is None else text.strip()


def meta_content(container, prop, default='NA'):
    """Return the stripped ``content`` of the ``<meta itemprop=prop>`` in *container*.

    Returns *default* when the meta element or its ``content`` attribute is
    missing, instead of failing on a ``None`` lookup result.
    """
    tag = container.find('meta', itemprop=prop)
    value = None if tag is None else tag.get('content')
    return default if value is None else value.strip()


def cell_text(cells, index):
    """Return ``cells[index].string``, or ``None`` if the cell or its text is absent."""
    if index >= len(cells):
        return None
    return cells[index].string


def range_midpoint(text):
    """Collapse a ``'low-high'`` range into the string of its mean.

    ``'600-699'`` becomes ``'649.5'``; text without a ``'-'`` is returned
    stripped and otherwise unchanged (so ``'NA'`` passes straight through).
    """
    if '-' not in text:
        return text.strip()
    low, high = text.split('-')[:2]
    return str((int(low.strip()) + int(high.strip())) / 2)


def split_beds_dens(text):
    """Split a bed label such as ``'2+1 bed'`` into ``(beds, dens)`` strings.

    A label without ``'+'`` has no den count, reported as ``'0'``.
    """
    cleaned = text.replace('bed', '')
    if '+' in cleaned:
        parts = cleaned.split('+')
        return parts[0].strip(), parts[1].strip()
    return cleaned.strip(), '0'


def normalize_size(text):
    """Normalise the third listing cell into a size string.

    The ``'sqft'`` suffix is dropped and ranges are averaged.  If the cell
    holds a ``'... Years Old'`` label instead of a size, ``'NA'`` is returned.
    NOTE(review): in that case the age information in this cell is discarded,
    as in the original logic - confirm whether it should populate the age.
    """
    size = text.replace('sqft', '').strip()
    if 'Years Old' in size:
        return 'NA'
    return range_midpoint(size)


def normalize_age(text, reference_year=REFERENCE_YEAR):
    """Normalise the fourth listing cell into an age-in-years string.

    Handles ``'New'`` (-> ``'0'``), ``'N+'`` (-> ``'N'``), ``'Built in YYYY'``
    (-> ``reference_year - YYYY``) and ``'A-B Years Old'`` (-> mean of A, B).
    """
    age = text.replace('Years Old', '').strip()
    age = '0' if 'New' in age else age.replace('+', '')
    if 'Built in' in age:
        built = int(age.replace('Built in', '').strip())
        # A build year after the reference year would give a negative number
        # whose '-' sign was then mistaken for a range separator; treat such
        # not-yet-built properties as new.
        return str(max(0, reference_year - built))
    return range_midpoint(age)


# ---------------------------------------------------------------------------
# Scrape every saved page (data/<city><page>.txt, pages 1-4) into summary.csv
# ---------------------------------------------------------------------------
for city in cities:
    for p in range(1, 5):
        data_file_name = os.path.join('data', city + str(p) + '.txt')
        if os.path.isfile(data_file_name):
            # Read raw bytes and let BeautifulSoup work out the encoding.
            with open(data_file_name, mode='rb') as page_file:
                all_html_content = page_file.read()
            soup = BeautifulSoup(all_html_content, 'html.parser')
            property_ul = soup.find_all('ul', class_=LISTING_VALUES_CLASS)
            property_div = soup.find_all('div', class_=LISTING_LOCATION_CLASS)
            # The two lists are paired by position, so a page on which their
            # lengths differ is skipped entirely.
            if len(property_ul) == len(property_div):
                for listing, location in zip(property_ul, property_div):
                    # Listings hidden behind a login carry no usable values.
                    if 'Free account required' not in str(listing):
                        price = tag_text(listing.find('span', itemprop='price')).replace(',', '')
                        address = tag_text(location.find('span', itemprop='streetAddress'))
                        neighbourhood = location.find('span', class_='neighbourhood')
                        neighbourhood = 'NA' if neighbourhood is None else neighbourhood.text.replace('•', '').strip()
                        latitude = meta_content(location, 'latitude')
                        longitude = meta_content(location, 'longitude')

                        # Cells are positional: beds(+dens), baths, size, age.
                        beds_baths_size = listing.find_all('li', class_='xs-inline')
                        beds_text = cell_text(beds_baths_size, 0)
                        beds, dens = ('NA', '0') if beds_text is None else split_beds_dens(beds_text)
                        baths_text = cell_text(beds_baths_size, 1)
                        baths = 'NA' if baths_text is None else baths_text.replace('bath', '').strip()
                        size_text = cell_text(beds_baths_size, 2)
                        size = 'NA' if size_text is None else normalize_size(size_text)
                        age_text = cell_text(beds_baths_size, 3)
                        age = 'NA' if age_text is None else normalize_age(age_text)

                        record = ', '.join([city, neighbourhood, address, latitude, longitude,
                                            beds, dens, baths, size, age, price])
                        print(record)
                        # Append one row per listing; explicit UTF-8 keeps the
                        # file independent of the platform default encoding.
                        with open('summary.csv', 'a', encoding='utf-8') as f:
                            f.write(record + '\n')