Uname:Linux Sandbox-A 4.4.0-210-generic #242-Ubuntu SMP Fri Apr 16 09:57:56 UTC 2021 x86_64

Base Dir : /var/www/html

User : gavin


403WebShell
403Webshell
Server IP : 68.183.124.220  /  Your IP : 216.73.217.137
Web Server : Apache/2.4.18 (Ubuntu)
System : Linux Sandbox-A 4.4.0-210-generic #242-Ubuntu SMP Fri Apr 16 09:57:56 UTC 2021 x86_64
User : gavin ( 1000)
PHP Version : 7.0.33-0ubuntu0.16.04.16
Disable Function : pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,
MySQL : OFF  |  cURL : ON  |  WGET : ON  |  Perl : ON  |  Python : ON  |  Sudo : ON  |  Pkexec : ON
Directory :  /home/gavin/workspace/comecondo/python/price_estimator/

Upload File :
current_dir [ Writeable ] document_root [ Writeable ]

 

Command :


[ Back ]     

Current File : /home/gavin/workspace/comecondo/python/price_estimator/zolo_listing.py
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 16 13:37:51 2021

@author: Gavin
"""
from sklearn.tree import DecisionTreeRegressor
from bs4 import BeautifulSoup
import mysql.connector
import os
import pandas as pd

# base_URL = 'https://www.zolo.ca/toronto-real-estate/'
base_URL = 'https://www.zolo.ca/'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}

cities = ['toronto', 'oakville', 'markham', 'hamilton', 'mississauga', 'newmarket', 'burlington', 'kitchener', 'vaughan']

# Year used to turn a "Built in <year>" label into an age in years.
REFERENCE_YEAR = 2021

# CSS class strings identifying the two parallel per-listing elements.
VALUES_CLASS = 'card-listing--values truncate list-unstyled xs-flex-order-1 xs-mb05'
LOCATION_CLASS = 'card-listing--location text-5 truncate xs-flex-order-2'

df_all = pd.DataFrame(columns=['City', 'Latitude', 'Longitude', 'Beds', 'Baths', 'Size', 'Age', 'Price'])
# os.chdir('C:\\Users\Gavin\workspace\ComeCondo\Python')
print(os.getcwd())


def midpoint_of_range(text):
    """Return the midpoint of an ``'a-b'`` integer range as a string.

    Text without a ``'-'`` is returned stripped.  Text that contains a
    ``'-'`` but is not a clean integer range (for example a negative number
    such as ``'-1'``) is also returned stripped instead of raising.
    """
    if '-' not in text:
        return text.strip()
    parts = text.split('-')
    try:
        return str((int(parts[0].strip()) + int(parts[1].strip())) / 2)
    except ValueError:
        return text.strip()


def split_beds_dens(raw):
    """Split a label such as ``'2+1 bed'`` into ``(beds, dens)`` strings.

    ``dens`` defaults to ``'0'`` when the label has no ``'+'``; ``beds`` is
    ``'NA'`` when *raw* is ``None``.
    """
    if raw is None:
        return 'NA', '0'
    beds_dens = raw.replace('bed', '')
    if '+' in beds_dens:
        parts = beds_dens.split('+')
        return parts[0].strip(), parts[1].strip()
    return beds_dens.strip(), '0'


def normalize_size(raw):
    """Normalise the third list item (expected to be a ``sqft`` label).

    Returns ``'NA'`` when *raw* is ``None`` or when the slot actually holds a
    ``'Years Old'`` label; a range such as ``'600-699 sqft'`` becomes its
    midpoint.
    """
    if raw is None:
        return 'NA'
    size = raw.replace('sqft', '').strip()
    # NOTE(review): when the slot holds an age label the age itself is
    # discarded here and not used for the age column -- confirm intended.
    if 'Years Old' in size:
        return 'NA'
    return midpoint_of_range(size)


def normalize_age(raw, reference_year=REFERENCE_YEAR):
    """Normalise the fourth list item (age label) to a number-like string.

    ``'New'`` maps to ``'0'``, ``'16+ Years Old'`` to ``'16'``, a range to
    its midpoint, and ``'Built in <year>'`` to ``reference_year - year``.
    Returns ``'NA'`` when *raw* is ``None``.
    """
    if raw is None:
        return 'NA'
    age = raw.replace('Years Old', '').strip()
    if 'New' in age:
        return '0'
    age = age.replace('+', '')
    if 'Built in' in age:
        # Return directly: a year after reference_year gives a negative
        # value whose '-' sign must not be mistaken for a range separator.
        return str(reference_year - int(age.replace('Built in', '').strip()))
    return midpoint_of_range(age)


def item_text(items, index):
    """Return ``items[index].string``, or ``None`` if the item is absent."""
    return items[index].string if index < len(items) else None


def meta_content(node, prop):
    """Return the stripped ``content`` of ``<meta itemprop=prop>`` or ``'NA'``."""
    meta = node.find('meta', itemprop=prop)
    content = None if meta is None else meta.get('content')
    return 'NA' if content is None else content.strip()


def parse_listing(city, values, location):
    """Extract the eleven output fields for one listing.

    *values* is the listing's ``<ul>`` of price/bed/bath items and *location*
    the matching location ``<div>``.  Every missing piece is reported as
    ``'NA'`` (``dens`` as ``'0'``).  Returns the fields in output order.
    """
    price_tag = values.find('span', itemprop='price')
    price = None if price_tag is None else price_tag.string
    price = 'NA' if price is None else price.replace(',', '').strip()

    address_tag = location.find('span', itemprop='streetAddress')
    address = None if address_tag is None else address_tag.string
    address = 'NA' if address is None else address.strip()

    hood_tag = location.find('span', class_='neighbourhood')
    neighbourhood = 'NA' if hood_tag is None else hood_tag.text.replace('•', '').strip()

    latitude = meta_content(location, 'latitude')
    longitude = meta_content(location, 'longitude')

    items = values.find_all('li', class_='xs-inline')
    beds, dens = split_beds_dens(item_text(items, 0))
    baths_raw = item_text(items, 1)
    baths = 'NA' if baths_raw is None else baths_raw.replace('bath', '').strip()
    size = normalize_size(item_text(items, 2))
    age = normalize_age(item_text(items, 3))

    return [city, neighbourhood, address, latitude, longitude,
            beds, dens, baths, size, age, price]


# Parse every saved result page (data/<city><page>.txt, pages 1-4), print one
# line per listing and append the same lines to summary.csv.
for city in cities:
    for p in range(1, 5):
        data_file_name = 'data/' + city + str(p) + '.txt'
        if not os.path.isfile(data_file_name):
            continue
        with open(data_file_name, mode='rb') as file:
            all_html_content = file.read()
        soup = BeautifulSoup(all_html_content, 'html.parser')
        property_ul = soup.find_all('ul', class_=VALUES_CLASS)
        property_div = soup.find_all('div', class_=LOCATION_CLASS)
        # The two lists are paired by position; a page where they differ in
        # length cannot be paired reliably and is skipped.
        if len(property_ul) != len(property_div):
            continue
        rows = []
        for values, location in zip(property_ul, property_div):
            if 'Free account required' in str(values):
                continue
            row = ', '.join(parse_listing(city, values, location))
            print(row)
            rows.append(row + '\n')
        # NOTE(review): fields are joined with ', ' and not quoted, so a
        # comma inside an address or neighbourhood shifts the columns.
        if rows:
            with open('summary.csv', 'a', encoding='utf-8') as f:
                f.writelines(rows)

Youez - 2016 - github.com/yon3zu
LinuXploit