Common libraries

In [2]:
from __future__ import print_function
import os.path
from collections import defaultdict
import string
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.feature_extraction.text import CountVectorizer
import wordcloud
%matplotlib inline

Code progress utility

In [7]:
# https://github.com/alexanderkuk/log-progress
# Progress indicator utilitty
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

Sitemap list

In [1]:
sitemap_list = [ 
    {'url': 'https://www.ig.com/sitemap.xml', 'recursive': 1},
    {'url': 'https://www.home.saxo/sitemap.xml', 'recursive': 0},    
    {'url': 'https://www.fxcm.com/sitemap.xml', 'recursive': 1},  
    {'url': 'https://www.icmarkets.com/sitemap_index.xml', 'recursive': 1},  
    {'url': 'https://www.cmcmarkets.com/en/sitemap.xml', 'recursive': 0},
    {'url': 'https://www.oanda.com/sitemap.xml', 'recursive': 0},     
    {'url': 'http://www.fxpro.co.uk/en_sitemap.xml', 'recursive': 0}, 
    {'url': 'https://en.swissquote.com/sitemap.xml', 'recursive': 0}, 
    {'url': 'https://admiralmarkets.com/sitemap.xml', 'recursive': 0},     
    {'url': 'https://www.xtb.com/sitemap.xml', 'recursive': 1},       
    {'url': 'https://www.ufx.com/en-GB/sitemap.xml', 'recursive': 0},   
    {'url': 'https://www.markets.com/sitemap.xml', 'recursive': 0},   
    {'url': 'https://www.fxclub.org/sitemap.xml', 'recursive': 1},       
    {'url': 'https://www.teletrade.eu/sitemap.xml', 'recursive': 1},       
    {'url': 'https://bmfn.com/sitemap.xml', 'recursive': 0},       
    {'url': 'https://www.thinkmarkets.com/en/sitemap.xml', 'recursive': 0},  
    {'url': 'https://www.etoro.com/sitemap.xml', 'recursive': 1},  
    {'url': 'https://www.activtrades.com/en/sitemap_index.xml', 'recursive': 1},  
    {'url': 'http://www.fxprimus.com/sitemap.xml', 'recursive': 0}
]

sitemap_list
Out[1]:
[{'recursive': 1, 'url': 'https://www.ig.com/sitemap.xml'},
 {'recursive': 0, 'url': 'https://www.home.saxo/sitemap.xml'},
 {'recursive': 1, 'url': 'https://www.fxcm.com/sitemap.xml'},
 {'recursive': 1, 'url': 'https://www.icmarkets.com/sitemap_index.xml'},
 {'recursive': 0, 'url': 'https://www.cmcmarkets.com/en/sitemap.xml'},
 {'recursive': 0, 'url': 'https://www.oanda.com/sitemap.xml'},
 {'recursive': 0, 'url': 'http://www.fxpro.co.uk/en_sitemap.xml'},
 {'recursive': 0, 'url': 'https://en.swissquote.com/sitemap.xml'},
 {'recursive': 0, 'url': 'https://admiralmarkets.com/sitemap.xml'},
 {'recursive': 1, 'url': 'https://www.xtb.com/sitemap.xml'},
 {'recursive': 0, 'url': 'https://www.ufx.com/en-GB/sitemap.xml'},
 {'recursive': 0, 'url': 'https://www.markets.com/sitemap.xml'},
 {'recursive': 1, 'url': 'https://www.fxclub.org/sitemap.xml'},
 {'recursive': 1, 'url': 'https://www.teletrade.eu/sitemap.xml'},
 {'recursive': 0, 'url': 'https://bmfn.com/sitemap.xml'},
 {'recursive': 0, 'url': 'https://www.thinkmarkets.com/en/sitemap.xml'},
 {'recursive': 1, 'url': 'https://www.etoro.com/sitemap.xml'},
 {'recursive': 1, 'url': 'https://www.activtrades.com/en/sitemap_index.xml'},
 {'recursive': 0, 'url': 'http://www.fxprimus.com/sitemap.xml'}]

Web scraping

In [4]:
from fake_useragent import UserAgent
ua = UserAgent()
headers = ua.chrome
headers = {'User-Agent': headers}
In [46]:
result = requests.get(sitemap_list[3]['url'])
c = result.content
c = c.decode("utf-8-sig")
c
Out[46]:
'<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="//www.icmarkets.com/main-sitemap.xsl"?>\n<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n\t<sitemap>\n\t\t<loc>https://www.icmarkets.com/post-sitemap.xml</loc>\n\t\t<lastmod>2016-12-16T07:13:32-01:00</lastmod>\n\t</sitemap>\n\t<sitemap>\n\t\t<loc>https://www.icmarkets.com/page-sitemap.xml</loc>\n\t\t<lastmod>2017-06-20T07:11:01+00:00</lastmod>\n\t</sitemap>\n\t<sitemap>\n\t\t<loc>https://www.icmarkets.com/attachment-sitemap1.xml</loc>\n\t\t<lastmod>2014-07-01T15:44:46+00:00</lastmod>\n\t</sitemap>\n\t<sitemap>\n\t\t<loc>https://www.icmarkets.com/attachment-sitemap2.xml</loc>\n\t\t<lastmod>2014-10-29T02:36:07-01:00</lastmod>\n\t</sitemap>\n\t<sitemap>\n\t\t<loc>https://www.icmarkets.com/attachment-sitemap3.xml</loc>\n\t\t<lastmod>2015-03-15T18:41:51-01:00</lastmod>\n\t</sitemap>\n\t<sitemap>\n\t\t<loc>https://www.icmarkets.com/attachment-sitemap4.xml</loc>\n\t\t<lastmod>2017-05-30T12:33:34+00:00</lastmod>\n\t</sitemap>\n\t<sitemap>\n\t\t<loc>https://www.icmarkets.com/category-sitemap.xml</loc>\n\t\t<lastmod>2016-12-16T07:13:32-01:00</lastmod>\n\t</sitemap>\n\t<sitemap>\n\t\t<loc>https://www.icmarkets.com/post_tag-sitemap.xml</loc>\n\t\t<lastmod>2014-03-27T01:14:54-01:00</lastmod>\n\t</sitemap>\n\t<sitemap>\n\t\t<loc>https://www.icmarkets.com/csscategory-sitemap.xml</loc>\n\t\t<lastmod>2013-06-11T00:02:10+00:00</lastmod>\n\t</sitemap>\n\t<sitemap>\n\t\t<loc>https://www.icmarkets.com/author-sitemap.xml</loc>\n\t\t<lastmod>2017-05-05T06:44:19+00:00</lastmod>\n\t</sitemap>\n</sitemapindex>\n<!-- XML Sitemap generated by Yoast SEO -->'
In [5]:
# xml tree parsing
import xml.etree.ElementTree as ET


def xml2df(xml_data):
    root = ET.XML(xml_data) # element tree
    all_records = []
    for i, child in enumerate(root):
        record = {}
        for subchild in child:
            record[subchild.tag] = subchild.text
            all_records.append(record)
    return pd.DataFrame(all_records)
In [54]:
df = xml2df(c)

list(df['{http://www.sitemaps.org/schemas/sitemap/0.9}loc'].values)
Out[54]:
['https://www.icmarkets.com/post-sitemap.xml',
 'https://www.icmarkets.com/post-sitemap.xml',
 'https://www.icmarkets.com/page-sitemap.xml',
 'https://www.icmarkets.com/page-sitemap.xml',
 'https://www.icmarkets.com/attachment-sitemap1.xml',
 'https://www.icmarkets.com/attachment-sitemap1.xml',
 'https://www.icmarkets.com/attachment-sitemap2.xml',
 'https://www.icmarkets.com/attachment-sitemap2.xml',
 'https://www.icmarkets.com/attachment-sitemap3.xml',
 'https://www.icmarkets.com/attachment-sitemap3.xml',
 'https://www.icmarkets.com/attachment-sitemap4.xml',
 'https://www.icmarkets.com/attachment-sitemap4.xml',
 'https://www.icmarkets.com/category-sitemap.xml',
 'https://www.icmarkets.com/category-sitemap.xml',
 'https://www.icmarkets.com/post_tag-sitemap.xml',
 'https://www.icmarkets.com/post_tag-sitemap.xml',
 'https://www.icmarkets.com/csscategory-sitemap.xml',
 'https://www.icmarkets.com/csscategory-sitemap.xml',
 'https://www.icmarkets.com/author-sitemap.xml',
 'https://www.icmarkets.com/author-sitemap.xml']
In [60]:
a = [1]
b = [2]
a.extend(b)
a
Out[60]:
[1, 2]
In [8]:
end_sitemap_list = []

for sitemap in log_progress(sitemap_list, every=1):
    if(sitemap['recursive']==1):
        try:
            result = requests.get(sitemap['url'], headers=headers)
            c = result.content
            c = c.decode("utf-8-sig")
            df = xml2df(c)
            end_sitemap_list.extend(list(df['{http://www.sitemaps.org/schemas/sitemap/0.9}loc'].values))
        except:
            print(sitemap)
    else:
        end_sitemap_list.extend([sitemap['url']])
{'url': 'https://www.teletrade.eu/sitemap.xml', 'recursive': 1}
In [9]:
len(end_sitemap_list)
Out[9]:
241
In [10]:
new_df = pd.DataFrame(end_sitemap_list)
new_df.to_csv('test.csv')
In [11]:
result_df = pd.DataFrame(columns=['changefreq','loc','priority'])
result_df
Out[11]:
changefreq loc priority
In [12]:
# change list to unique list
end_sitemap_list = list(set(end_sitemap_list))
In [13]:
len(end_sitemap_list)
Out[13]:
203
In [14]:
result_df = pd.DataFrame(columns=['changefreq','loc','priority'])

for sitemap in log_progress(end_sitemap_list, every=1):
    
    result = requests.get(sitemap, headers=headers)
    c = result.content
    try:
        c = c.decode("utf-8-sig")
        df = xml2df(c)
        columns = [
            '{http://www.sitemaps.org/schemas/sitemap/0.9}changefreq',
            '{http://www.sitemaps.org/schemas/sitemap/0.9}loc',
            '{http://www.sitemaps.org/schemas/sitemap/0.9}priority'
        ]
        try: 
            df2 = df[columns]
            df2['source'] = sitemap
            df2.columns = ['changefreq','loc','priority','source']
        except:
            df2['loc'] = df['{http://www.sitemaps.org/schemas/sitemap/0.9}loc']
            df2['changefreq'] = ''
            df2['priority'] = ''
            df2['source'] = sitemap
        result_df = result_df.append(df2)
    except:
        print(sitemap)
    
/usr/local/lib/python3.5/dist-packages/ipykernel/__main__.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
https://www.fxcm.com/markets/sitemap-video-en-markets.xml
https://admiralmarkets.com/sitemap.xml
https://www.fxcm.com/sitemap-main.xml.gz
https://www.fxcm.com/uk/sitemap-video-en-gb.xml
https://www.oanda.com/sitemap.xml
In [15]:
result_df.shape
Out[15]:
(14047393, 4)
In [136]:
masks = [1,2]
for idx, val in enumerate(masks):
    print(idx,val)
0 1
1 2
In [162]:
str1 = "this is string example....wow!!!";
str2 = "exam";

print (str1.find(str2))
print (str1.find(str2, 10))
print (str1.find(str2, 40))
15
15
-1
In [5]:
def process_http(string,masks):
    try:
        for idx, val in enumerate(masks):
            if (string.find(val)>-1):
                string = string.replace(val,'',1)
                return (string, idx)
        return (string, -1)
    except:
        return (string, -1)   
In [17]:
process_http('http://www.fxprimus.com/about-us/?lang=es', ['http://','https://'])
Out[17]:
('www.fxprimus.com/about-us/?lang=es', 0)
In [18]:
result_df.to_csv('all_sitemaps_2.csv')

Data analysis

In [2]:
result_df = pd.read_csv('all_sitemaps_2.csv')
/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [3]:
result_df.head()
Out[3]:
Unnamed: 0 changefreq loc priority source
0 0 always https://www.ig.com/no/aksjer-nyheter/2014/03/2... 0.5 https://www.ig.com/no/sitemap-no-6.xml
1 1 always https://www.ig.com/no/aksjer-nyheter/2014/03/2... 0.5 https://www.ig.com/no/sitemap-no-6.xml
2 2 always https://www.ig.com/no/aksjer-nyheter/2014/03/2... 0.5 https://www.ig.com/no/sitemap-no-6.xml
3 3 always https://www.ig.com/no/aksjer-nyheter/2014/03/2... 0.5 https://www.ig.com/no/sitemap-no-6.xml
4 4 always https://www.ig.com/no/aksjer-nyheter/2014/03/2... 0.5 https://www.ig.com/no/sitemap-no-6.xml
In [6]:
result_df['protocol'] = result_df['loc'].apply(lambda x: process_http(x,['http://','https://'])[1])
result_df['loc'] = result_df['loc'].apply(lambda x: process_http(x,['http://','https://'])[0])
In [7]:
result_df.head()
Out[7]:
Unnamed: 0 changefreq loc priority source protocol
0 0 always www.ig.com/no/aksjer-nyheter/2014/03/25/blackb... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1
1 1 always www.ig.com/no/aksjer-nyheter/2014/03/25/blackb... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1
2 2 always www.ig.com/no/aksjer-nyheter/2014/03/25/blackb... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1
3 3 always www.ig.com/no/aksjer-nyheter/2014/03/25/blackb... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1
4 4 always www.ig.com/no/aksjer-nyheter/2014/03/25/blackb... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1
In [13]:
table = pd.pivot_table(
    result_df,
    index=["broker"],
    columns=["protocol", "www"],
    values=["loc"],
    aggfunc={"loc":len},fill_value=0
)
In [14]:
table
Out[14]:
loc
protocol -1 0 1
www -1 -1 0 -1 0
broker
3755472 0 0 0 0
activtrades.com 0 0 0 0 5108
bmfn.com 0 0 0 7220 0
cmcmarkets.com 0 0 0 0 317
en. 0 1352 0 0 0
etoro.com 0 0 0 0 58264
etoro.com.cn 0 0 0 0 4015
fxclub.org 0 0 0 0 184833
fxcm.com 0 0 120 0 14698
fxprimus.com 0 0 12768 0 0
fxpro.co.uk 0 0 4464 0 0
help.etoro.com 0 2 0 0 0
home.saxo 0 0 0 0 2632
icmarkets.com 0 0 0 0 11774
ig.com 0 0 0 0 9976623
landing.fxcm.com 0 0 0 428 0
markets.com 0 0 6477 0 0
thinkmarkets.com 0 0 0 0 242
ufx.com 0 0 0 0 444
xtb.com 0 0 0 0 140
In [9]:
result_df['www'] = result_df['loc'].apply(lambda x: process_http(x,['www.'])[1])
result_df['loc'] = result_df['loc'].apply(lambda x: process_http(x,['www.'])[0])
In [10]:
result_df.head()
Out[10]:
Unnamed: 0 changefreq loc priority source protocol www
0 0 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0
1 1 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0
2 2 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0
3 3 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0
4 4 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0
In [197]:
try_split('fxprimus.com/vu-home/?lang=ar','/',0)
Out[197]:
'fxprimus.com'
In [11]:
def try_split(string, sep,idx):
    try:
        return string.split(sep)[idx]
    except:
        return ''
In [12]:
result_df['broker'] = result_df['loc'].apply(lambda x: try_split(x,'/',0))
In [15]:
table = pd.pivot_table(
    result_df,
    index=["broker"],
    columns=["protocol", "www"],
    values=["loc"],
    aggfunc={"loc":len},fill_value=0
)
table
Out[15]:
loc
protocol -1 0 1
www -1 -1 0 -1 0
broker
3755472 0 0 0 0
activtrades.com 0 0 0 0 5108
bmfn.com 0 0 0 7220 0
cmcmarkets.com 0 0 0 0 317
en. 0 1352 0 0 0
etoro.com 0 0 0 0 58264
etoro.com.cn 0 0 0 0 4015
fxclub.org 0 0 0 0 184833
fxcm.com 0 0 120 0 14698
fxprimus.com 0 0 12768 0 0
fxpro.co.uk 0 0 4464 0 0
help.etoro.com 0 2 0 0 0
home.saxo 0 0 0 0 2632
icmarkets.com 0 0 0 0 11774
ig.com 0 0 0 0 9976623
landing.fxcm.com 0 0 0 428 0
markets.com 0 0 6477 0 0
thinkmarkets.com 0 0 0 0 242
ufx.com 0 0 0 0 444
xtb.com 0 0 0 0 140
In [16]:
result_df.to_csv('all_sitemaps_2.csv')
In [17]:
result_df['first_nav'] = result_df['loc'].apply(lambda x: try_split(x,'/',1))
In [18]:
1+1
Out[18]:
2
In [19]:
result_df.to_csv('all_sitemaps_2.csv')
In [23]:
table = pd.pivot_table(
    result_df,
    index=["first_nav"],
    columns=["broker"],
    values=["loc"],
    aggfunc={"loc":len},fill_value=0
)
table
Out[23]:
loc
broker activtrades.com bmfn.com cmcmarkets.com en. etoro.com etoro.com.cn fxclub.org fxcm.com fxprimus.com ... help.etoro.com home.saxo icmarkets.com ig.com landing.fxcm.com markets.com teletrade.eu thinkmarkets.com ufx.com xtb.com
first_nav
2692135 0 4 0 3 2 0 3 0 0 ... 0 4 2 0 0 3 11 0 0 0
10cashbackbonus 0 0 0 0 0 0 0 0 0 8 ... 0 0 0 0 0 0 0 0 0 0
13-oil-prices-likely-to-remain-volatile 0 0 0 0 0 0 0 0 0 4 ... 0 0 0 0 0 0 0 0 0 0
15-trading-bonus 0 0 0 0 0 0 0 0 0 8 ... 0 0 0 0 0 0 0 0 0 0
2010 0 0 0 0 0 0 0 0 0 0 ... 0 0 8 0 0 0 0 0 0 0
2011 0 0 0 0 0 0 0 0 0 0 ... 0 0 4 0 0 0 0 0 0 0
2012 0 0 0 0 0 0 0 0 0 0 ... 0 0 36 0 0 0 0 0 0 0
2013 0 0 0 0 0 0 0 0 0 0 ... 0 0 92 0 0 0 0 0 0 0
2014 0 0 0 0 0 0 0 0 0 0 ... 0 0 6424 0 0 0 0 0 0 0
2015 0 0 0 0 0 0 0 0 0 0 ... 0 0 3116 0 0 0 0 0 0 0
2016 0 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 0 0 0 0
22-the-week-in-focus-15-february-19-february 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
23-the-week-in-focus-22-february-26-february 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
24-the-week-in-focus-29-february-4-march 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
25-the-week-in-focus-07-march-13-march 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
26-the-week-in-focus-14-march-18-march 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
28-the-week-in-focus-28-march-1-april 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
28-times-of-the-trumple 0 0 0 0 0 0 0 0 0 44 ... 0 0 0 0 0 0 0 0 0 0
29-the-week-in-focus-4-april-8-april 0 0 0 0 0 0 0 0 0 16 ... 0 0 0 0 0 0 0 0 0 0
30-londons-war-on-terror-sends-markets-plummeting 0 0 0 0 0 0 0 0 0 44 ... 0 0 0 0 0 0 0 0 0 0
30-the-week-in-focus-11-april-15-april 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
31-fomc-june-special 0 0 0 0 0 0 0 0 0 44 ... 0 0 0 0 0 0 0 0 0 0
31-the-week-in-focus-18-april-22-april 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
32-the-week-in-focus-25-april-29-april 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
33-the-week-in-focus-01-mayl-06-may 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
34-the-week-in-focus-09-may-13-may 0 0 0 0 0 0 0 0 0 16 ... 0 0 0 0 0 0 0 0 0 0
35-the-week-in-focus-16-may 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
36-the-week-in-focus-23-may 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
38-the-week-in-focus-06-june 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
39-the-week-in-focus-13-june 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
yen-weakens-further-in-asia-as-fed-hike-debate-grows 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weakens-further-in-asia-bank-lending-gains-noted 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-after-boj-minutes-shows-negative-interest-rate-concerns 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-after-cpi-comes-in-flat-for-february 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-after-data-sets-as-asia-focus-turns-to-caixing-pmi 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-after-gain-in-corporate-services-index-fed-seen-key 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-after-gdp-surprises-on-upside-china-trade-data-ahead 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-after-kuroda-comments-on-economy-39-s-progress 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-after-q1-gdp-risk-events-in-focus 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-after-q2-gdp-disappoints-stimulus-plans-eyed 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-after-surprise-spike-in-january-core-machinery-orders 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-ahead-of-retail-sales-jobs-household-spending 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-ahead-of-trade-data-markets-focused-on-brexit 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-as-eyes-turn-to-u-s-jobs-g-20-summit 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-as-markets-mull-fed-views-for-november 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-as-safe-haven-demand-eases-on-polls-pound-gains 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-in-asia-after-mixed-data-u-s-nonfarm-payrolls-eyed 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-in-asia-after-trade-data-eyes-on-fed-boj 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-in-asia-after-trade-data-fed-aussie-jobs-ahead 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-in-asia-as-china-and-japan-data-paint-mixed-outlook 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-in-early-asia-with-bank-of-japan-ahead 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-in-early-asia-with-focus-on-fed-harker-sees-hikes 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
yen-weaker-in-early-asia-with-focus-this-week-on-yellen-views 0 0 4 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
za 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 515288 0 0 0 0 0 0
zatronet-li-mirovoi-finansovyi-krizis-2011-2012-gg-rossiyu 0 0 0 0 0 0 0 3 0 0 ... 0 0 0 0 0 0 0 0 0 0
zew-survey-nfib-small-business-optimism-japan-machinery-orders 0 0 0 0 0 0 0 0 0 20 ... 0 0 0 0 0 0 0 0 0 0
zh 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 384979 0 0 0 0 0 0
zh-tw 0 0 0 0 0 2419 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
zhurnal-forex-magazine 0 0 0 0 0 0 0 4 0 0 ... 0 0 0 0 0 0 0 0 0 0
zona-evro 0 0 0 0 0 0 0 4 0 0 ... 0 0 0 0 0 0 0 0 0 0

2857 rows × 21 columns

In [24]:
table.to_csv('first_split.csv')
In [20]:
result_df['second_nav'] = result_df['loc'].apply(lambda x: try_split(x,'/',2))
In [27]:
table = pd.pivot_table(
    result_df,
    index=["first_nav","second_nav"],
    columns=["broker"],
    values=["loc"],
    aggfunc={"loc":len},fill_value=0
)
table
Out[27]:
loc
broker activtrades.com bmfn.com cmcmarkets.com en. etoro.com etoro.com.cn fxclub.org fxcm.com fxprimus.com ... help.etoro.com home.saxo icmarkets.com ig.com landing.fxcm.com markets.com teletrade.eu thinkmarkets.com ufx.com xtb.com
first_nav second_nav
2692135 0 4 0 3 2 0 3 0 0 ... 0 4 2 0 0 3 11 0 0 0
10cashbackbonus ?lang=cn 0 0 0 0 0 0 0 0 0 4 ... 0 0 0 0 0 0 0 0 0 0
?lang=vi 0 0 0 0 0 0 0 0 0 4 ... 0 0 0 0 0 0 0 0 0 0
13-oil-prices-likely-to-remain-volatile ?lang=cn 0 0 0 0 0 0 0 0 0 4 ... 0 0 0 0 0 0 0 0 0 0
15-trading-bonus ?lang=cn 0 0 0 0 0 0 0 0 0 4 ... 0 0 0 0 0 0 0 0 0 0
?lang=vi 0 0 0 0 0 0 0 0 0 4 ... 0 0 0 0 0 0 0 0 0 0
2010 10 0 0 0 0 0 0 0 0 0 0 ... 0 0 8 0 0 0 0 0 0 0
2011 12 0 0 0 0 0 0 0 0 0 0 ... 0 0 4 0 0 0 0 0 0 0
2012 01 0 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 0 0 0 0
02 0 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 0 0 0 0
03 0 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 0 0 0 0
05 0 0 0 0 0 0 0 0 0 0 ... 0 0 4 0 0 0 0 0 0 0
06 0 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 0 0 0 0
07 0 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 0 0 0 0
08 0 0 0 0 0 0 0 0 0 0 ... 0 0 10 0 0 0 0 0 0 0
09 0 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 ... 0 0 8 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 0 0 0 0
2013 03 0 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 0 0 0 0
04 0 0 0 0 0 0 0 0 0 0 ... 0 0 8 0 0 0 0 0 0 0
05 0 0 0 0 0 0 0 0 0 0 ... 0 0 10 0 0 0 0 0 0 0
06 0 0 0 0 0 0 0 0 0 0 ... 0 0 4 0 0 0 0 0 0 0
07 0 0 0 0 0 0 0 0 0 0 ... 0 0 4 0 0 0 0 0 0 0
08 0 0 0 0 0 0 0 0 0 0 ... 0 0 4 0 0 0 0 0 0 0
09 0 0 0 0 0 0 0 0 0 0 ... 0 0 44 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 ... 0 0 8 0 0 0 0 0 0 0
11 0 0 0 0 0 0 0 0 0 0 ... 0 0 4 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 0 ... 0 0 4 0 0 0 0 0 0 0
2014 02 0 0 0 0 0 0 0 0 0 0 ... 0 0 4 0 0 0 0 0 0 0
03 0 0 0 0 0 0 0 0 0 0 ... 0 0 197 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
zh trading-sectors 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 46 0 0 0 0 0 0
trading-shares 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 26 0 0 0 0 0 0
trading-stops 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 28 0 0 0 0 0 0
two-factor-authentication 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 26 0 0 0 0 0 0
types-of-etp 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 27 0 0 0 0 0 0
types-of-orders 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 19 0 0 0 0 0 0
types-of-risk 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 19 0 0 0 0 0 0
us-presidential-election 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 29 0 0 0 0 0 0
us-stocks-irs-tax-forms 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 46 0 0 0 0 0 0
ways-to-manage-risk 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 27 0 0 0 0 0 0
ways-to-short-sell 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 26 0 0 0 0 0 0
ways-to-trade-forex 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 28 0 0 0 0 0 0
ways-to-trade-indices 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 27 0 0 0 0 0 0
welcome 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 12 0 0 0 0 0 0
welcome-to-your-account 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 17 0 0 0 0 0 0
what-is-a-trading-plan 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 20 0 0 0 0 0 0
what-is-an-etp 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 18 0 0 0 0 0 0
what-is-short-selling 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 19 0 0 0 0 0 0
whats-new 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 24 0 0 0 0 0 0
why-use-a-trading-plan 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 28 0 0 0 0 0 0
whyig 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 28 0 0 0 0 0 0
windows-phone 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 49 0 0 0 0 0 0
windows-tablet 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 28 0 0 0 0 0 0
your-trading-plan 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 28 0 0 0 0 0 0
zh-tw 0 0 0 0 0 2 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
discover 0 0 0 0 0 21 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
funds 0 0 0 0 0 104 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
markets 0 0 0 0 0 2292 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
zhurnal-forex-magazine 0 0 0 0 0 0 0 4 0 0 ... 0 0 0 0 0 0 0 0 0 0
zona-evro 0 0 0 0 0 0 0 4 0 0 ... 0 0 0 0 0 0 0 0 0 0

39445 rows × 21 columns

In [29]:
table.to_csv('second_split.csv')
In [21]:
result_df.to_csv('all_sitemaps_2.csv')

Go deeper in Data analysis / prepare data

In [33]:
# mapping1.xlsx and mapping2.xlsx are manual annotation datasets 
In [1]:
from IPython.display import Image
Image("manual_annotation.png")
Out[1]:
In [22]:
mapping_1_df = pd.read_excel('mapping1.xlsx')
mapping_1_df
Out[22]:
group first_split_type second_split_type
0 instruments language-alias instruments
1 news language-alias news
2 how-to_blog language-alias how-to
3 other language-alias other
4 static language-alias static
5 cTrader language-alias metatrader
6 lp language-alias lp
7 how-to_blog news_stats article
8 other news_stats other
9 how-to_blog news_stats time
10 how-to_blog news_stats author
11 news news_stats news
12 news news news
13 news news time
14 news news other
15 news other (blank)
16 news other news
17 news other how-to
18 news other legal
19 news other time
20 news other static
21 instruments other instruments
22 news other lp
23 news time news
24 instruments instuments instuments
25 other long-tail (blank)
26 instruments long-tail instruments
27 static long-tail static
28 how-to_blog long-tail how-to
29 other long-tail lp
30 instruments stats stats
31 static static static
32 how-to_blog how-to how-to
33 cTrader how-to metatrader
In [23]:
mapping_2_df = pd.read_excel('mapping2.xlsx')
mapping_2_df
Out[23]:
first_split second_split first_split_type second_split_type
0 es ig-acciones language-alias instruments
1 fr ig-actions language-alias instruments
2 13-oil-prices-likely-to-remain-volatile ?lang=cn news news
3 15-trading-bonus ?lang=cn news news
4 15-trading-bonus ?lang=vi news news
5 2010 10 time news
6 2011 12 time news
7 2012 1 time news
8 2012 2 time news
9 2012 3 time news
10 2012 5 time news
11 2012 6 time news
12 2012 7 time news
13 2012 8 time news
14 2012 9 time news
15 2012 10 time news
16 2012 12 time news
17 2013 3 time news
18 2013 4 time news
19 2013 5 time news
20 2013 6 time news
21 2013 7 time news
22 2013 8 time news
23 2013 9 time news
24 2013 10 time news
25 2013 11 time news
26 2013 12 time news
27 2014 2 time news
28 2014 3 time news
29 2014 4 time news
... ... ... ... ...
39414 zh trading-sectors language-alias how-to
39415 zh trading-shares language-alias instruments
39416 zh trading-stops language-alias how-to
39417 yen-weaker-in-asia-after-trade-data-fed-aussie... NaN long-tail NaN
39418 yen-weaker-in-asia-as-china-and-japan-data-pai... NaN long-tail NaN
39419 yen-weaker-in-early-asia-with-bank-of-japan-ahead NaN long-tail NaN
39420 yen-weaker-in-early-asia-with-focus-on-fed-har... NaN long-tail NaN
39421 zatronet-li-mirovoi-finansovyi-krizis-2011-201... NaN long-tail NaN
39422 zew-survey-nfib-small-business-optimism-japan-... ?lang=cn long-tail NaN
39423 zh ways-to-manage-risk language-alias how-to
39424 zh ways-to-short-sell language-alias how-to
39425 zh ways-to-trade-forex language-alias how-to
39426 zh ways-to-trade-indices language-alias how-to
39427 zew-survey-nfib-small-business-optimism-japan-... ?lang=es long-tail NaN
39428 zew-survey-nfib-small-business-optimism-japan-... ?lang=id long-tail NaN
39429 zh what-is-a-trading-plan language-alias how-to
39430 zh what-is-an-etp language-alias how-to
39431 zh what-is-short-selling language-alias how-to
39432 zh whats-new language-alias how-to
39433 zh why-use-a-trading-plan language-alias how-to
39434 zh whyig language-alias lp
39435 zew-survey-nfib-small-business-optimism-japan-... ?lang=my long-tail NaN
39436 zew-survey-nfib-small-business-optimism-japan-... ?lang=th long-tail NaN
39437 zh your-trading-plan language-alias how-to
39438 zh-tw NaN language-alias other
39439 zh-tw discover language-alias other
39440 zh-tw funds language-alias other
39441 zh-tw markets language-alias other
39442 zhurnal-forex-magazine NaN long-tail NaN
39443 zona-evro NaN long-tail NaN

39444 rows × 4 columns

In [5]:
result_df = pd.read_csv('all_sitemaps.csv')
result_df
/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (1,2,7,8,9,10,11,12,13) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[5]:
Unnamed: 0 changefreq loc priority source protocol www broker first_nav second_nav third_nav fourth group language word_vectors
0 0 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN how-to_blog is is ig video what is forex 3903255001001
1 1 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN how-to_blog is is ig video what is forex 3903255001001
2 2 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN how-to_blog is is ig video what is forex 3903255001001
3 3 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN how-to_blog is is ig video what is forex 3903255001001
4 4 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN how-to_blog is is ig video what is forex 3903255001001
5 5 always ig.com/is/ig-video/what-is-bitcoin--3917234263001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-bitcoin--3917234263001 NaN how-to_blog is is ig video what is bitcoin 3917234263001
6 6 always ig.com/is/ig-video/what-is-bitcoin--3917234263001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-bitcoin--3917234263001 NaN how-to_blog is is ig video what is bitcoin 3917234263001
7 7 always ig.com/is/ig-video/what-is-bitcoin--3917234263001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-bitcoin--3917234263001 NaN how-to_blog is is ig video what is bitcoin 3917234263001
8 8 always ig.com/is/ig-video/what-is-bitcoin--3917234263001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-bitcoin--3917234263001 NaN how-to_blog is is ig video what is bitcoin 3917234263001
9 9 always ig.com/is/ig-video/what-is-bitcoin--3917234263001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-bitcoin--3917234263001 NaN how-to_blog is is ig video what is bitcoin 3917234263001
10 10 always ig.com/is/ig-video/what-are-shares--3838181878001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-shares--3838181878001 NaN how-to_blog is is ig video what are shares 3838181878001
11 11 always ig.com/is/ig-video/what-are-shares--3838181878001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-shares--3838181878001 NaN how-to_blog is is ig video what are shares 3838181878001
12 12 always ig.com/is/ig-video/what-are-shares--3838181878001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-shares--3838181878001 NaN how-to_blog is is ig video what are shares 3838181878001
13 13 always ig.com/is/ig-video/what-are-shares--3838181878001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-shares--3838181878001 NaN how-to_blog is is ig video what are shares 3838181878001
14 14 always ig.com/is/ig-video/what-are-shares--3838181878001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-shares--3838181878001 NaN how-to_blog is is ig video what are shares 3838181878001
15 15 always ig.com/is/ig-video/what-are-commodities--37167... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-commodities--3716706859001 NaN how-to_blog is is ig video what are commodities 3716706859001
16 16 always ig.com/is/ig-video/what-are-commodities--37167... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-commodities--3716706859001 NaN how-to_blog is is ig video what are commodities 3716706859001
17 17 always ig.com/is/ig-video/what-are-commodities--37167... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-commodities--3716706859001 NaN how-to_blog is is ig video what are commodities 3716706859001
18 18 always ig.com/is/ig-video/what-are-commodities--37167... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-commodities--3716706859001 NaN how-to_blog is is ig video what are commodities 3716706859001
19 19 always ig.com/is/ig-video/what-are-commodities--37167... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-commodities--3716706859001 NaN how-to_blog is is ig video what are commodities 3716706859001
20 20 always ig.com/is/ig-video/what-are-stock-indices--371... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-stock-indices--3716738462001 NaN how-to_blog is is ig video what are stock indices 3716738462001
21 21 always ig.com/is/ig-video/what-are-stock-indices--371... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-stock-indices--3716738462001 NaN how-to_blog is is ig video what are stock indices 3716738462001
22 22 always ig.com/is/ig-video/what-are-stock-indices--371... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-stock-indices--3716738462001 NaN how-to_blog is is ig video what are stock indices 3716738462001
23 23 always ig.com/is/ig-video/what-are-stock-indices--371... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-stock-indices--3716738462001 NaN how-to_blog is is ig video what are stock indices 3716738462001
24 24 always ig.com/is/ig-video/what-are-stock-indices--371... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-stock-indices--3716738462001 NaN how-to_blog is is ig video what are stock indices 3716738462001
25 25 always ig.com/is/ig-video/binaries-explained-37854205... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video binaries-explained-3785420547001 NaN how-to_blog is is ig video binaries explained 3785420547001
26 26 always ig.com/is/ig-video/binaries-explained-37854205... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video binaries-explained-3785420547001 NaN how-to_blog is is ig video binaries explained 3785420547001
27 27 always ig.com/is/ig-video/binaries-explained-37854205... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video binaries-explained-3785420547001 NaN how-to_blog is is ig video binaries explained 3785420547001
28 28 always ig.com/is/ig-video/binaries-explained-37854205... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video binaries-explained-3785420547001 NaN how-to_blog is is ig video binaries explained 3785420547001
29 29 always ig.com/is/ig-video/binaries-explained-37854205... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video binaries-explained-3785420547001 NaN how-to_blog is is ig video binaries explained 3785420547001
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5668971 5668971 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668972 5668972 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668973 5668973 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668974 5668974 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668975 5668975 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668976 5668976 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668977 5668977 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668978 5668978 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668979 5668979 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668980 5668980 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668981 5668981 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668982 5668982 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668983 5668983 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668984 5668984 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668985 5668985 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668986 5668986 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668987 5668987 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668988 5668988 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668989 5668989 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668990 5668990 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668991 5668991 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668992 5668992 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668993 5668993 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668994 5668994 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668995 5668995 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668996 5668996 always ig.com/fr-ch/ig-actions/canadian-imperial-bank... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions canadian-imperial-bank-of-commerce-CM-CA NaN instruments fr-ch fr ch ig actions canadian imperial bank of com...
5668997 5668997 always ig.com/fr-ch/ig-actions/unione-di-banche-itali... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions unione-di-banche-italiane-UBI-IT NaN instruments fr-ch fr ch ig actions unione di banche italiane UBI IT
5668998 5668998 always ig.com/fr-ch/ig-actions/unione-di-banche-itali... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions unione-di-banche-italiane-UBI-IT NaN instruments fr-ch fr ch ig actions unione di banche italiane UBI IT
5668999 5668999 always ig.com/fr-ch/ig-actions/unione-di-banche-itali... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions unione-di-banche-italiane-UBI-IT NaN instruments fr-ch fr ch ig actions unione di banche italiane UBI IT
5669000 5669000 always ig.com/fr-ch/ig-actions/unione-di-banche-itali... 0.5 https://www.ig.com/fr-ch/sitemap-fr-ch-4.xml 1 0 ig.com fr-ch ig-actions unione-di-banche-italiane-UBI-IT NaN instruments fr-ch fr ch ig actions unione di banche italiane UBI IT

5669001 rows × 15 columns

In [79]:
del result_df['word_vectors']
result_df.head()
Out[79]:
changefreq loc priority source protocol www broker first_nav second_nav third_nav fourth
0 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN
1 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN
2 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN
3 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN
4 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN
In [10]:
result_df['third_nav'] = result_df['loc'].apply(lambda x: try_split(x,'/',3))
In [17]:
result_df['fourth'] = result_df['loc'].apply(lambda x: try_split(x,'/',4))
In [24]:
result_df = pd.merge(
    result_df,
    mapping_2_df,
    how='left',
    left_on=['first_nav', 'second_nav'],
    right_on=['first_split', 'second_split'])
result_df
Out[24]:
Unnamed: 0 changefreq loc priority source protocol www broker first_nav second_nav first_split second_split first_split_type second_split_type
0 0 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
1 1 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
2 2 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
3 3 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
4 4 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
5 5 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
6 6 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
7 7 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
8 8 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
9 9 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
10 10 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
11 11 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
12 12 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
13 13 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14 14 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
15 15 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
16 16 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
17 17 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
18 18 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
19 19 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
20 20 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
21 21 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
22 22 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
23 23 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
24 24 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
25 25 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
26 26 always ig.com/no/aksjer-nyheter/2014/03/25/banking-se... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
27 27 always ig.com/no/aksjer-nyheter/2014/03/25/banking-se... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
28 28 always ig.com/no/aksjer-nyheter/2014/03/25/banking-se... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
29 29 always ig.com/no/aksjer-nyheter/2014/03/25/banking-se... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14047363 89745 always ig.com/no/aksjer-nyheter/2014/03/24/Monday-s-u... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047364 89746 always ig.com/no/aksjer-nyheter/2014/03/24/Monday-s-u... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047365 89747 always ig.com/no/aksjer-nyheter/2014/03/24/Monday-s-u... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047366 89748 always ig.com/no/aksjer-nyheter/2014/03/24/Monday-s-u... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047367 89749 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047368 89750 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047369 89751 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047370 89752 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047371 89753 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047372 89754 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047373 89755 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047374 89756 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047375 89757 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047376 89758 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047377 89759 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047378 89760 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047379 89761 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047380 89762 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047381 89763 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047382 89764 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047383 89765 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047384 89766 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047385 89767 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047386 89768 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047387 89769 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047388 89770 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047389 89771 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047390 89772 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047391 89773 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news
14047392 89774 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news

14047393 rows × 14 columns

In [25]:
result_df = pd.merge(
    result_df,
    mapping_1_df,
    how='left',
    left_on=['first_split_type', 'second_split_type'],
    right_on=['first_split_type', 'second_split_type'])

result_df
Out[25]:
Unnamed: 0 changefreq loc priority source protocol www broker first_nav second_nav first_split second_split first_split_type second_split_type group
0 0 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
1 1 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
2 2 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
3 3 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
4 4 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
5 5 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
6 6 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
7 7 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
8 8 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
9 9 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
10 10 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
11 11 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
12 12 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
13 13 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14 14 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
15 15 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
16 16 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
17 17 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
18 18 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
19 19 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
20 20 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
21 21 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
22 22 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
23 23 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
24 24 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
25 25 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
26 26 always ig.com/no/aksjer-nyheter/2014/03/25/banking-se... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
27 27 always ig.com/no/aksjer-nyheter/2014/03/25/banking-se... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
28 28 always ig.com/no/aksjer-nyheter/2014/03/25/banking-se... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
29 29 always ig.com/no/aksjer-nyheter/2014/03/25/banking-se... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14047363 89745 always ig.com/no/aksjer-nyheter/2014/03/24/Monday-s-u... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047364 89746 always ig.com/no/aksjer-nyheter/2014/03/24/Monday-s-u... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047365 89747 always ig.com/no/aksjer-nyheter/2014/03/24/Monday-s-u... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047366 89748 always ig.com/no/aksjer-nyheter/2014/03/24/Monday-s-u... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047367 89749 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047368 89750 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047369 89751 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047370 89752 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047371 89753 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047372 89754 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047373 89755 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047374 89756 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047375 89757 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047376 89758 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047377 89759 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047378 89760 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047379 89761 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047380 89762 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047381 89763 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047382 89764 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047383 89765 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047384 89766 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047385 89767 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047386 89768 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047387 89769 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047388 89770 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047389 89771 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047390 89772 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047391 89773 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news
14047392 89774 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter no aksjer-nyheter language-alias news news

14047393 rows × 15 columns

In [26]:
result_df['language'] = result_df[result_df['first_split_type']=='language-alias'].first_nav
    
In [27]:
result_df[result_df['first_split_type']!='language-alias']
Out[27]:
Unnamed: 0 changefreq loc priority source protocol www broker first_nav second_nav first_split second_split first_split_type second_split_type group language
220394 94872 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220395 94873 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220396 94874 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220397 94875 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220398 94876 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220399 94877 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220400 94878 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220401 94879 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220402 94880 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220403 94881 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220404 94882 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220405 94883 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220406 94884 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220407 94885 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220408 94886 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220409 94887 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220410 94888 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220411 94889 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220412 94890 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220413 94891 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220414 94892 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220415 94893 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220416 94894 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220417 94895 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220418 94896 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220419 94897 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220420 94898 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220421 94899 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220422 94900 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
220423 94901 always ig.com/it-ch 1 https://www.ig.com/it-ch/sitemap-it-ch-4.xml 1 0 ig.com it-ch NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13957588 99810 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957589 99811 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957590 99812 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957591 99813 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957592 99814 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957593 99815 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957594 99816 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957595 99817 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957596 99818 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957597 99819 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957598 99820 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957599 99821 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957600 99822 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957601 99823 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957602 99824 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957603 99825 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957604 99826 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957605 99827 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957606 99828 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957607 99829 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957608 99830 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957609 99831 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957610 99832 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957611 99833 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957612 99834 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957613 99835 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957614 99836 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957615 99837 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957616 99838 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN
13957617 99839 NaN NaN NaN https://www.icmarkets.com/attachment-sitemap4.xml -1 -1 NaN NaN NaN NaN NaN NaN

4006860 rows × 16 columns

In [28]:
del result_df['second_split_type']
del result_df['first_split_type']
del result_df['second_split']
del result_df['first_split']
In [52]:
result_df['loc'].to_csv('url_list.csv')
In [90]:
result_df.to_csv('all_sitemaps_temp.csv')
In [43]:
'ig.com/is/ig-video/what-is-forex--3903255001001'.split('/')
Out[43]:
['ig.com', 'is', 'ig-video', 'what-is-forex--3903255001001']
In [56]:
split_url('ig.com/ae/market-update/2016/03/10/running-out-of-grenades-isnt-a-bad-thing-31238','/','-')
Out[56]:
['ae',
 'market',
 'update',
 '2016',
 '03',
 '10',
 'running',
 'out',
 'of',
 'grenades',
 'isnt',
 'a',
 'bad',
 'thing',
 '31238']
In [3]:
result_df = pd.read_csv('all_sitemaps.csv')
result_df
/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py:2683: DtypeWarning: Columns (1,2,7,8,9,10,11,12) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[3]:
Unnamed: 0 changefreq loc priority source protocol www broker first_nav second_nav third_nav fourth fondex group word_vectors
0 0 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'is', 'forex', '...
1 1 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'is', 'forex', '...
2 2 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'is', 'forex', '...
3 3 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'is', 'forex', '...
4 4 always ig.com/is/ig-video/what-is-forex--3903255001001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-forex--3903255001001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'is', 'forex', '...
5 5 always ig.com/is/ig-video/what-is-bitcoin--3917234263001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-bitcoin--3917234263001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'is', 'bitcoin',...
6 6 always ig.com/is/ig-video/what-is-bitcoin--3917234263001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-bitcoin--3917234263001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'is', 'bitcoin',...
7 7 always ig.com/is/ig-video/what-is-bitcoin--3917234263001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-bitcoin--3917234263001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'is', 'bitcoin',...
8 8 always ig.com/is/ig-video/what-is-bitcoin--3917234263001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-bitcoin--3917234263001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'is', 'bitcoin',...
9 9 always ig.com/is/ig-video/what-is-bitcoin--3917234263001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-is-bitcoin--3917234263001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'is', 'bitcoin',...
10 10 always ig.com/is/ig-video/what-are-shares--3838181878001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-shares--3838181878001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'shares',...
11 11 always ig.com/is/ig-video/what-are-shares--3838181878001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-shares--3838181878001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'shares',...
12 12 always ig.com/is/ig-video/what-are-shares--3838181878001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-shares--3838181878001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'shares',...
13 13 always ig.com/is/ig-video/what-are-shares--3838181878001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-shares--3838181878001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'shares',...
14 14 always ig.com/is/ig-video/what-are-shares--3838181878001 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-shares--3838181878001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'shares',...
15 15 always ig.com/is/ig-video/what-are-commodities--37167... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-commodities--3716706859001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'commodit...
16 16 always ig.com/is/ig-video/what-are-commodities--37167... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-commodities--3716706859001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'commodit...
17 17 always ig.com/is/ig-video/what-are-commodities--37167... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-commodities--3716706859001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'commodit...
18 18 always ig.com/is/ig-video/what-are-commodities--37167... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-commodities--3716706859001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'commodit...
19 19 always ig.com/is/ig-video/what-are-commodities--37167... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-commodities--3716706859001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'commodit...
20 20 always ig.com/is/ig-video/what-are-stock-indices--371... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-stock-indices--3716738462001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'stock', ...
21 21 always ig.com/is/ig-video/what-are-stock-indices--371... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-stock-indices--3716738462001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'stock', ...
22 22 always ig.com/is/ig-video/what-are-stock-indices--371... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-stock-indices--3716738462001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'stock', ...
23 23 always ig.com/is/ig-video/what-are-stock-indices--371... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-stock-indices--3716738462001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'stock', ...
24 24 always ig.com/is/ig-video/what-are-stock-indices--371... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video what-are-stock-indices--3716738462001 NaN how-to_blog ['is', 'ig', 'video', 'what', 'are', 'stock', ...
25 25 always ig.com/is/ig-video/binaries-explained-37854205... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video binaries-explained-3785420547001 NaN how-to_blog ['is', 'ig', 'video', 'binaries', 'explained',...
26 26 always ig.com/is/ig-video/binaries-explained-37854205... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video binaries-explained-3785420547001 NaN how-to_blog ['is', 'ig', 'video', 'binaries', 'explained',...
27 27 always ig.com/is/ig-video/binaries-explained-37854205... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video binaries-explained-3785420547001 NaN how-to_blog ['is', 'ig', 'video', 'binaries', 'explained',...
28 28 always ig.com/is/ig-video/binaries-explained-37854205... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video binaries-explained-3785420547001 NaN how-to_blog ['is', 'ig', 'video', 'binaries', 'explained',...
29 29 always ig.com/is/ig-video/binaries-explained-37854205... 0.5 https://www.ig.com/is/sitemap-is-1.xml 1 0 ig.com is ig-video binaries-explained-3785420547001 NaN how-to_blog ['is', 'ig', 'video', 'binaries', 'explained',...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13312804 13312804 always ig.com/cn/ig-shares/ipath-pure-beta-sugar-etn 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares ipath-pure-beta-sugar-etn NaN instruments ['cn', 'ig', 'shares', 'ipath', 'pure', 'beta'...
13312805 13312805 always ig.com/cn/ig-shares/ipath-pure-beta-sugar-etn 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares ipath-pure-beta-sugar-etn NaN instruments ['cn', 'ig', 'shares', 'ipath', 'pure', 'beta'...
13312806 13312806 always ig.com/cn/ig-shares/ipath-pure-beta-sugar-etn 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares ipath-pure-beta-sugar-etn NaN instruments ['cn', 'ig', 'shares', 'ipath', 'pure', 'beta'...
13312807 13312807 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312808 13312808 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312809 13312809 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312810 13312810 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312811 13312811 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312812 13312812 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312813 13312813 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312814 13312814 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312815 13312815 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312816 13312816 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312817 13312817 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312818 13312818 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312819 13312819 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312820 13312820 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312821 13312821 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312822 13312822 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312823 13312823 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312824 13312824 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312825 13312825 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312826 13312826 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312827 13312827 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312828 13312828 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312829 13312829 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312830 13312830 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312831 13312831 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312832 13312832 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...
13312833 13312833 always ig.com/cn/ig-shares/db-x-trackers---us-dollar-... 0.5 https://www.ig.com/cn/sitemap-cn-2.xml 1 0 ig.com cn ig-shares db-x-trackers---us-dollar-cash-ucits-etf NaN instruments ['cn', 'ig', 'shares', 'db', 'x', 'trackers', ...

13312834 rows × 14 columns

In [29]:
def split_url(string,sep1,sep2):
    try:
        split_list = string.split(sep1)
        result_list = []

        for idx, split_part in enumerate(split_list):
            if (idx>0):
                try:
                    split_part_list = split_part.split(sep2)
                    result_list.append(split_part_list)                 
                except:
                    result_list.append(split_part)  

        return ' '.join([item for sublist in result_list for item in sublist])
    except:
        return []
In [30]:
result_df['word_vectors'] = result_df['loc'].apply(lambda x: split_url(x,'/','-'))
In [31]:
result_df
Out[31]:
Unnamed: 0 changefreq loc priority source protocol www broker first_nav second_nav group language word_vectors
0 0 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
1 1 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
2 2 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
3 3 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
4 4 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
5 5 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
6 6 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
7 7 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
8 8 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
9 9 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
10 10 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
11 11 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
12 12 always ig.com/no/aksjer-nyheter/2014/03/25/blackberry... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 blackberry contin...
13 13 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
14 14 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
15 15 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
16 16 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
17 17 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
18 18 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
19 19 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
20 20 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
21 21 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
22 22 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
23 23 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
24 24 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
25 25 always ig.com/no/aksjer-nyheter/2014/03/25/king-prepa... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 king prepares to ...
26 26 always ig.com/no/aksjer-nyheter/2014/03/25/banking-se... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 banking sector sl...
27 27 always ig.com/no/aksjer-nyheter/2014/03/25/banking-se... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 banking sector sl...
28 28 always ig.com/no/aksjer-nyheter/2014/03/25/banking-se... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 banking sector sl...
29 29 always ig.com/no/aksjer-nyheter/2014/03/25/banking-se... 0.5 https://www.ig.com/no/sitemap-no-6.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 banking sector sl...
... ... ... ... ... ... ... ... ... ... ... ... ... ...
14047363 89745 always ig.com/no/aksjer-nyheter/2014/03/24/Monday-s-u... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 Monday s uk broke...
14047364 89746 always ig.com/no/aksjer-nyheter/2014/03/24/Monday-s-u... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 Monday s uk broke...
14047365 89747 always ig.com/no/aksjer-nyheter/2014/03/24/Monday-s-u... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 Monday s uk broke...
14047366 89748 always ig.com/no/aksjer-nyheter/2014/03/24/Monday-s-u... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 Monday s uk broke...
14047367 89749 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047368 89750 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047369 89751 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047370 89752 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047371 89753 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047372 89754 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047373 89755 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047374 89756 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047375 89757 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047376 89758 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047377 89759 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047378 89760 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047379 89761 always ig.com/no/aksjer-nyheter/2014/03/24/aerospace-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 24 aerospace sector ...
14047380 89762 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...
14047381 89763 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...
14047382 89764 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...
14047383 89765 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...
14047384 89766 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...
14047385 89767 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...
14047386 89768 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...
14047387 89769 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...
14047388 89770 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...
14047389 89771 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...
14047390 89772 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...
14047391 89773 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...
14047392 89774 always ig.com/no/aksjer-nyheter/2014/03/25/Tuesday-s-... 0.5 https://www.ig.com/no/sitemap-no-5.xml 1 0 ig.com no aksjer-nyheter news no no aksjer nyheter 2014 03 25 Tuesday s uk brok...

14047393 rows × 13 columns

In [32]:
result_df.to_csv('all_sitemaps_2.csv')
In [2]:
result_df = pd.read_csv('all_sitemaps.csv')
/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py:2683: DtypeWarning: Columns (1,2,7,8,9,10,11,12) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [33]:
result_df = result_df[['word_vectors','group', 'broker', 'language']]
In [34]:
result_df.to_csv('all_sitemaps_words_2.csv')

PCA / word2vec / cluster analysis

Naive group analysis

In [3]:
result_df = pd.read_csv('all_sitemaps_words_2.csv')
/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (2,4) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [12]:
result_df.language.fillna(value='na', inplace=True)
In [5]:
result_df[pd.isnull(result_df.language)==True]
Out[5]:
Unnamed: 0 word_vectors group broker language
In [13]:
texts = result_df[(result_df['group']=='instruments')&(result_df['language']=='en-ch')].word_vectors.fillna(value='').values.tolist()
In [6]:
table = pd.pivot_table(
    result_df,
    index=["language"],
    columns=["group"],
    values=["Unnamed: 0"],
    aggfunc={"Unnamed: 0":len},fill_value=0
)
table
Out[6]:
Unnamed: 0
group cTrader how-to_blog instruments lp news other static
language
ae 0 3449 380426 28 68803 2036 197
ar 16 20 4 0 20 6614 0
ar-ae 0 1351 380427 28 186 2109 49
at 52 8703 380398 0 23412 4113 0
au 44 12681 381994 209 131759 4404 49
bd 0 0 0 0 0 24 0
bg 22 538 1168 2 29 751 51
ca 22 120 196 0 0 212 0
ch 44 7716 380275 0 23214 3875 0
cn 60 1495 380642 28 830 2215 49
cy 0 498 1144 0 0 613 49
cz 0 498 1144 0 0 623 49
de 68 9532 380488 0 23566 12671 0
deu 0 528 1048 0 0 796 0
dk 0 40 1558 0 0 670 0
ee 0 498 98 0 0 0 0
en 20 682 3588 13 42 1408 63
en-GB 0 20 0 0 0 364 56
en-ch 44 6037 380683 28 118594 2238 49
es 60 8269 383018 0 6664 9823 0
esp 0 523 1264 0 0 644 0
fi 0 498 1018 0 0 613 49
fr 60 8775 413134 0 19291 11848 0
fr-ch 44 5865 418836 0 22992 3198 0
fra 0 518 1142 0 0 748 0
gr 0 498 1144 0 0 627 49
hk 16 40 24 0 4 96 0
hr 0 498 1144 0 0 613 49
hu 2 502 1156 0 0 689 49
id 0 54 0 0 0 411 42
... ... ... ... ... ... ... ...
it-ch 44 4975 382539 0 4690 3445 0
jp 0 4800 380232 28 6361 2033 49
lp 0 27 0 0 0 286 0
lt 0 503 1144 0 0 613 49
lu 44 5912 411604 0 20186 3405 0
lv 0 498 1144 0 0 613 49
me 0 0 0 0 0 4 0
mt 0 498 1144 0 0 613 49
my 0 88 20 0 0 370 0
na 12 16152 7944 0 5975 66317 2361
nl 70 6941 424316 0 89264 3949 0
no 44 8583 379647 0 115944 4029 0
ph 0 25 0 0 0 180 0
pl 0 0 0 0 0 6452 0
por 0 458 1175 0 0 716 0
pt 18 4 14 0 42 146 0
ro 0 40 1538 0 0 686 0
ru 18 74 30 2 30 6674 8
se 44 9876 398915 0 99490 4444 0
sg 44 8425 381317 28 120609 2496 49
si 0 498 1144 0 0 613 49
sk 0 498 1144 0 0 637 49
tr 0 0 0 0 0 10 0
tw 16 42 24 0 4 98 0
uk 46 17900 383687 203 129787 4695 49
vn 0 47 10 0 0 491 0
we 0 0 0 0 0 24 0
za 44 9467 381062 28 121691 3089 49
zh 0 1421 380481 28 817 2241 49
zh-tw 0 0 0 0 0 2417 0

63 rows × 7 columns

In [8]:
len(texts)
Out[8]:
380683
In [28]:
set(result_df.language.values)
Out[28]:
{'ae',
 'ar',
 'ar-ae',
 'at',
 'au',
 'bg',
 'ch',
 'cn',
 'cy',
 'cz',
 'de',
 'deu',
 'dk',
 'en',
 'en-GB',
 'en-ch',
 'es',
 'fi',
 'fr',
 'fr-ch',
 'hk',
 'hu',
 'id',
 'is',
 'it',
 'it-ch',
 'jp',
 'lp',
 'lu',
 'lv',
 'na',
 'nl',
 'no',
 'ph',
 'pl',
 'pt',
 'r1',
 'ro',
 'ru',
 'se',
 'sg',
 'tr',
 'tw',
 'uk',
 'za',
 'zh',
 'zh-tw'}
In [14]:
cv = CountVectorizer(max_features=1000)
cv_fit=cv.fit_transform(random.sample(texts, 380000))
word_array = cv_fit.toarray()
In [10]:
word_array.shape
Out[10]:
(380000, 1000)
In [15]:
word_names = cv.get_feature_names()
In [16]:
word_popularity = word_array.sum(axis=0)
In [17]:
word_popularity = np.vstack((word_popularity,word_names))
In [14]:
word_popularity
Out[14]:
array([['297', '1134', '162', ..., '135', '189', '161'],
       ['10', '100', '1000', ..., 'zdp', 'zealand', 'zinc']], 
      dtype='<U21')
In [18]:
word_popularity_df = pd.DataFrame(word_popularity.T)
word_popularity_df.columns = ['word_popularity','word']
word_popularity_df[['word_popularity']] = word_popularity_df[['word_popularity']].apply(pd.to_numeric)
word_popularity_df['word_popularity'] = word_popularity_df['word_popularity'].apply(lambda x: np.sqrt(x))
word_popularity_df[['word_popularity']] = word_popularity_df[['word_popularity']].apply(np.ceil)
    
word_popularity_df.sort_values(by='word_popularity', ascending=False)
Out[18]:
word_popularity word
174 619 ch
297 617 en
439 616 ig
816 612 shares
448 253 inc
952 236 us
541 225 limited
562 222 ltd
700 216 plc
941 185 uk
318 184 etf
405 170 group
220 148 corp
940 148 ucits
496 147 ishares
427 139 holdings
90 126 au
934 122 trust
761 113 resources
44 112 ag
778 110 sa
207 107 company
190 106 co
370 104 fund
300 103 energy
616 100 msci
223 99 corporation
477 96 international
425 95 hk
735 94 public
... ... ...
164 12 car
652 12 odier
281 12 eafe
905 12 templeton
906 12 ten
402 12 greece
73 12 animal
747 12 re
887 12 surgical
390 12 gmbh
388 12 glass
646 12 norwegian
110 12 base
769 12 rnc
375 12 garden
108 12 bar
107 12 bankshares
378 12 gcp
871 12 stone
380 12 generation
873 12 store
641 12 nomura
642 12 nor
765 12 reuters
382 12 genetics
383 12 geo
384 12 ger
881 12 suisse
294 12 emperor
698 12 plains

1000 rows × 2 columns

In [26]:
core_text = ''
for word in list(word_popularity_df['word'].values):
    for x in range(0, word_popularity_df[word_popularity_df['word']== word].word_popularity.values[0].astype(np.int64)):
        core_text = core_text + ' ' + word


title = 'English sitemaps wordcloud'
plt.figure(figsize=(30,15))
wc = wordcloud.WordCloud(background_color='white', width=1200, height=600, max_font_size=100, max_words=400).generate(core_text)
wc.recolor(random_state=0)
plt.imshow(wc)
plt.title(title, fontsize=30)
plt.axis("off")
Out[26]:
(-0.5, 1199.5, 599.5, -0.5)