"""Author: Shimin233

Web scraping for job applications: pulls the company, job title, location,
and other details out of supported job-description pages (LinkedIn and
Targetjobs). References are cited inline next to the code they informed.
"""
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests 
from urllib.request import urlopen
import re #for some string formatting

#LinkedIn
def linkedin_s(page_soup=None):
    """Print company, job title, location and site name parsed from a
    LinkedIn job page whose <title> has the form
    '<company> hiring <job> in <location> | LinkedIn'.

    page_soup: optional BeautifulSoup-like object with a ``.title.string``
    attribute; defaults to the module-level ``soup`` so existing callers
    keep working unchanged.
    """
    if page_soup is None:
        page_soup = soup  # module-level soup built from the user-supplied URL
    title = page_soup.title.string  # hoisted: avoid repeated attribute lookups
    h = title.find('hiring')
    i = title.find(' in')
    if h == -1 or i == -1:
        # str.find returns -1 on a miss; the original slicing then silently
        # printed garbage, so report the unexpected title instead.
        print('Could not parse the page title: ', title)
        return
    print('company: ', title[:h-1])
    print('job: ', title[h+7:i])
    print('location: ', title[i+4:-10])
    print('web: ', title[-8:])
#Target jobs
def _leading_text_until(text, stop, digits_only=False):
    """Return the slice of ``text`` from its first meaningful character up
    to the ``stop``-th newline (negative ``stop`` counts from the end).

    A "meaningful" character is any char that is not a newline or a space,
    or the first digit when ``digits_only`` is True (used for the deadline,
    which begins with a date). Falls back to ``text.strip()`` when no such
    character exists — the original per-field loops left their result
    variable unbound in that case, which raised NameError later.
    """
    # ref: https://www.delftstack.com/howto/python/python-find-all-occurrences-in-string/
    newline_positions = [m.start() for m in re.finditer('\n', text)]
    for idx, ch in enumerate(text):
        found = ch in '0123456789' if digits_only else ch not in ('\n', ' ')
        if found:
            return text[idx:newline_positions[stop]]
    return text.strip()


def targetjobs_s():
    """Print company, deadline, job, level and location scraped from the
    Targetjobs posting held in the module-level ``soup``."""
    # First matching <div> holds the scheme-type and location paragraphs.
    # ref: https://stackoverflow.com/a/22284921
    scheme_location = soup.find_all(
        'div', {"class": ["pt-7", "text-gray-700", "font-medium", "md:pt-5"]}
    )[0]

    # level: grad, entry-level, etc. (first <p>, up to its second newline)
    # ref for select('p'): https://stackoverflow.com/a/49679586
    level = _leading_text_until(scheme_location.select('p')[0].string, 1)

    # location: GBR, City of London, etc. (second <p>)
    loc = _leading_text_until(scheme_location.select('p')[1].string, 1)

    # company name and (optional) deadline paragraphs
    co_ddl = soup.find_all('p', {"class": ["text-gray", 'text-base', 'line-clamp-1', 'mb-1']})

    # company (anchor text inside the first matching <p>)
    company = _leading_text_until(co_ddl[0].a.string, 1)

    # deadline: starts at the first digit; some postings omit it entirely,
    # in which case co_ddl has a single element and indexing raises.
    try:
        ddl_ = _leading_text_until(co_ddl[1].text, -1, digits_only=True)
    except IndexError:
        ddl_ = 'No stated deadline'

    # job title (from the page <title>, up to its last newline)
    job = _leading_text_until(soup.title.string, -1)

    print('company: ', company)
    print('deadline: ', ddl_)
    print('job: ', job)
    print('level: ', level)
    print('location: ', loc)


# Detect which web source the URL belongs to, then run the matching scraper.

url = input('The job description page address: ')  # input() already returns a str
# Context-manage the HTTP response so the connection is closed even on error.
with urlopen(url) as page:
    html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
web_list = ['linkedin', 'targetjobs']
# Run the scraper that matches the URL's host.
if 'linkedin' in url:
    linkedin_s()
elif 'targetjobs' in url:
    targetjobs_s()
else:
    print('This website is not included yet; the supported websites are as below: ',
        ', '.join(web_list))

# You can test this script with real job description pages from LinkedIn or Targetjobs.