import urllib3
# Download every page of the US News "Best Global Universities" ranking and
# keep the raw response bodies for the parsing step.

# One connection pool shared by all page requests.
http = urllib3.PoolManager()

# Request headers sent with every page request (browser-style User-Agent).
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

# URL template; the placeholder receives the 1-based page number.
globalRequest = 'https://www.usnews.com/education/best-global-universities/rankings?page={}'

# Raw response body (bytes) of each ranking page, keyed by page number.
htmlData = dict()
for i in range(1, 151):
    # Progress indicator: page numbers are printed on a single line.
    print(i, end=' ')
    html = http.request('GET', globalRequest.format(i), headers=hdr)
    # Fail here, with the page number, instead of storing an error page that
    # would only surface later as an obscure failure while parsing.
    if not 200 <= html.status < 300:
        raise RuntimeError(
            'request for ranking page {} failed with HTTP status {}'.format(i, html.status)
        )
    htmlData[i] = html.data
import re
from bs4 import BeautifulSoup
# Parse the downloaded ranking pages into a flat list of university records.

# Captures the digits that follow '#' in a rank label.
rankPattern = re.compile(r'#([0-9]+)')


def _lastNestedText(parent, outerTag, outerClass, innerTag):
    """Return the text of the first ``innerTag`` inside the last matching outer tag.

    Every ``outerTag`` element with CSS class ``outerClass`` below ``parent``
    is visited and the last one wins.  Returns ``None`` when no outer element
    matches, so the caller never reuses a value left over from a previous
    entry.  Raises ``IndexError`` if a matching outer element contains no
    ``innerTag``.
    """
    text = None
    for outerNode in parent.find_all(outerTag, {'class': outerClass}):
        text = outerNode.find_all(innerTag)[0].text
    return text


def _parseUniversity(divNode):
    """Build one ``{'name', 'region', 'global_ranking'}`` record from an entry node.

    Raises ``ValueError`` when the rank label, the name or the region cannot
    be found in ``divNode``.
    """
    rankNodes = divNode.find_all('span', {'class': 'rankscore-bronze'})
    if not rankNodes:
        raise ValueError('entry has no rank label')
    rankText = rankNodes[0].text
    matchResult = rankPattern.search(rankText)
    if matchResult is None:
        raise ValueError('cannot read a rank from {!r}'.format(rankText))

    name = _lastNestedText(divNode, 'h2', 'h-taut', 'a')
    region = _lastNestedText(divNode, 'div', 't-taut', 'span')
    if name is None or region is None:
        raise ValueError('entry ranked #{} has no name or region'.format(matchResult.group(1)))

    return {
        'name': name,
        'region': region,
        'global_ranking': int(matchResult.group(1)),
    }


uniInfo = []
# Walk the pages that were actually downloaded, in page order, so the list
# stays in ranking order without repeating the page count here.
for pageNo in sorted(htmlData):
    soup = BeautifulSoup(htmlData[pageNo], 'html.parser')
    resultNode = soup.find(id='resultsMain')
    if resultNode is None:
        raise ValueError('page {} has no "resultsMain" element'.format(pageNo))
    for divNode in resultNode.find_all('div', {'class': 'sep'}):
        try:
            uniInfo.append(_parseUniversity(divNode))
        except ValueError as err:
            # Add the page number so a malformed entry can be located.
            raise ValueError('page {}: {}'.format(pageNo, err)) from err
# In total, the ranking covers 1,500 universities worldwide.
# Sanity check: show how many records were collected, then the first record
# as a sample of the record layout.
universityCount = len(uniInfo)
print(universityCount)
firstUniversity = uniInfo[0]
print(firstUniversity)
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
# Horizontal bar chart: number of ranked universities per region, with the
# most frequent region at the top and the count written next to each bar.
allRegion = [uni['region'] for uni in uniInfo]
regionCounter = Counter(allRegion)
fig = plt.figure(figsize=(8, 16))

# most_common() without an argument returns every region, most frequent first.
regionTop = regionCounter.most_common()
regionTopNames, regionTopCount = zip(*regionTop)

# One bar per region, at integer y positions 0..n-1.
barPositions = np.arange(len(regionTopNames))
plt.barh(barPositions, regionTopCount)
axes = plt.gca()
axes.invert_yaxis()
plt.yticks(barPositions, regionTopNames)
plt.ylim((len(regionTopNames) - 0.5, -0.5))
# Extra room on the right for the count labels.
plt.xlim((0, max(regionTopCount) + 40))

# Write each count just past the end of its bar.
for barIndex, count in enumerate(regionTopCount):
    plt.text(count + 5, barIndex - 0.3, repr(count), ha='left', va='top')
# Horizontal bar chart: median global rank per region, in the same region
# order as the count chart, labelled "median (number of universities)".
fig = plt.figure(figsize=(8, 16))

# Group global rankings by region; every charted region starts with an empty list.
regionRanking = {regionName: [] for regionName in regionTopNames}
rankTuples = [(uni['region'], uni['global_ranking']) for uni in uniInfo]
for regionName, globalRank in rankTuples:
    regionRanking[regionName].append(globalRank)

regionMedian = [np.median(regionRanking[regionName]) for regionName in regionTopNames]

# One bar per region, at integer y positions 0..n-1.
barPositions = np.arange(len(regionTopNames))
plt.barh(barPositions, regionMedian)
axes = plt.gca()
axes.invert_yaxis()
plt.yticks(barPositions, regionTopNames)
plt.ylim((len(regionTopNames) - 0.5, -0.5))
# Extra room on the right for the text labels.
plt.xlim((0, max(regionMedian) + 180))

# Write "median (count)" just past the end of each bar.
for barIndex, (median, count) in enumerate(zip(regionMedian, regionTopCount)):
    label = '{} ({})'.format(int(median), count)
    plt.text(median + 20, barIndex - 0.3, label, ha='left', va='top')
from scipy.interpolate import interp1d
# Strip plot: one row of 80 square markers per region.  The x axis is the
# percentile position within the region's list of rankings and the marker
# colour is the global rank linearly interpolated at that position.
fig = plt.figure(figsize=(8, 20))

# The same 80 evenly spaced positions in [0, 1] are sampled for every region.
samplePoints = np.linspace(0.0, 1.0, 80)

for rowIndex, regionName in enumerate(regionTopNames):
    rawRanking = regionRanking[regionName]
    rankCount = len(rawRanking)
    if rankCount == 1:
        # A single ranking cannot be interpolated: use it for the whole row.
        colors = np.asarray([rawRanking[0]] * samplePoints.size)
    else:
        # Spread the rankings evenly over [0, 1] in list order and interpolate.
        percentile = np.arange(rankCount) / (rankCount - 1)
        colors = interp1d(percentile, rawRanking)(samplePoints)
    # Every row uses the same colour map and limits, so the handle kept from
    # the last row is enough to build the colour bar below.
    cbar = plt.scatter(samplePoints, [rowIndex] * samplePoints.size, 25, colors,
                       cmap='jet', marker='s', vmin=1, vmax=1500)

axes = plt.gca()
axes.invert_yaxis()
plt.yticks(np.arange(len(regionTopNames)), regionTopNames)
plt.ylim((len(regionTopNames) - 0.5, -0.5))
plt.xlim((0, 1.0))
plt.xlabel('Percentile within region')

# Shrink the main plot and put a thin horizontal colour bar in the freed strip.
plt.subplots_adjust(top=0.87)
barAx = plt.axes([0.1, 0.9, 0.8, 0.01])
plt.colorbar(cbar, cax=barAx, orientation='horizontal')