"""Scrape per-wiki page and edit statistics from stats.wikimedia.org.

For every Wikipedia language edition the script downloads four CSV exports
(pages/edits x content/non-content) through a headless Firefox session,
keeps the most recent monthly row of each, and writes a small summary CSV
splitting the totals into bot and human contributions.
"""

import os
import shutil
import ssl
import urllib.request
import warnings
from time import sleep
from urllib.error import HTTPError

import pandas as pd
import selenium
from selenium import webdriver

warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)
pd.options.display.float_format = '{:.2f}'.format
# NOTE(review): this disables TLS certificate verification for every HTTPS
# request made through urllib in this process. Kept because the script has
# always run this way, but it should be removed once the host has a working
# CA bundle.
ssl._create_default_https_context = ssl._create_unverified_context

# Pages that publish the list of Wikipedias as an HTML table.
_WIKI_LIST_URLS = (
    r'https://en.wikipedia.org/wiki/Statistics_of_Wikipedias',
    r'https://meta.wikimedia.org/wiki/List_of_Wikipedias',
)

# Snapshot used when the live list cannot be fetched or parsed.
_FALLBACK_WIKI_CODES = {
    'en': 'English', 'ceb': 'Cebuano', 'de': 'German', 'sv': 'Swedish',
    'fr': 'French', 'nl': 'Dutch', 'ru': 'Russian', 'es': 'Spanish',
    'it': 'Italian', 'arz': 'Egyptian Arabic', 'pl': 'Polish',
    'ja': 'Japanese', 'zh': 'Chinese', 'vi': 'Vietnamese',
    'uk': 'Ukrainian', 'war': 'Waray', 'ar': 'Arabic', 'pt': 'Portuguese',
    'fa': 'Persian', 'ca': 'Catalan', 'sr': 'Serbian', 'id': 'Indonesian',
    'ko': 'Korean', 'no': 'Norwegian (Bokmål)', 'ce': 'Chechen',
    'fi': 'Finnish', 'cs': 'Czech', 'tr': 'Turkish', 'hu': 'Hungarian',
    'tt': 'Tatar', 'sh': 'Serbo-Croatian', 'ro': 'Romanian',
    'zh-min-nan': 'Southern Min', 'eu': 'Basque', 'ms': 'Malay',
    'eo': 'Esperanto', 'he': 'Hebrew', 'hy': 'Armenian', 'da': 'Danish',
    'bg': 'Bulgarian', 'cy': 'Welsh', 'sk': 'Slovak',
    'azb': 'South Azerbaijani', 'uz': 'Uzbek', 'et': 'Estonian',
    'simple': 'Simple English', 'be': 'Belarusian', 'kk': 'Kazakh',
    'min': 'Minangkabau', 'el': 'Greek', 'hr': 'Croatian',
    'lt': 'Lithuanian', 'gl': 'Galician', 'az': 'Azerbaijani',
    'ur': 'Urdu', 'sl': 'Slovene', 'lld': 'Ladin', 'ka': 'Georgian',
    'nn': 'Norwegian (Nynorsk)', 'hi': 'Hindi', 'th': 'Thai',
    'ta': 'Tamil', 'bn': 'Bengali', 'la': 'Latin', 'mk': 'Macedonian',
    'zh-yue': 'Cantonese', 'ast': 'Asturian', 'lv': 'Latvian',
    'af': 'Afrikaans', 'tg': 'Tajik', 'my': 'Burmese', 'mg': 'Malagasy',
    'mr': 'Marathi', 'sq': 'Albanian', 'bs': 'Bosnian', 'oc': 'Occitan',
    'te': 'Telugu', 'ml': 'Malayalam', 'nds': 'Low German',
    'be-tarask': 'Belarusian (Taraškievica)', 'br': 'Breton',
    'ky': 'Kyrgyz', 'sw': 'Swahili', 'jv': 'Javanese', 'lmo': 'Lombard',
    'new': 'Newar', 'pnb': 'Western Punjabi', 'vec': 'Venetian',
    'ht': 'Haitian Creole', 'pms': 'Piedmontese', 'ba': 'Bashkir',
    'lb': 'Luxembourgish', 'su': 'Sundanese', 'ku': 'Kurdish (Kurmanji)',
    'ga': 'Irish', 'szl': 'Silesian', 'is': 'Icelandic',
    'fy': 'West Frisian', 'cv': 'Chuvash', 'ckb': 'Kurdish (Sorani)',
    'pa': 'Punjabi', 'tl': 'Tagalog', 'an': 'Aragonese',
    'wuu': 'Wu Chinese', 'diq': 'Zaza', 'io': 'Ido', 'sco': 'Scots',
    'vo': 'Volapük', 'yo': 'Yoruba', 'ne': 'Nepali', 'ia': 'Interlingua',
    'kn': 'Kannada', 'gu': 'Gujarati', 'als': 'Alemannic German',
    'ha': 'Hausa', 'avk': 'Kotava', 'bar': 'Bavarian',
    'crh': 'Crimean Tatar', 'scn': 'Sicilian',
    'bpy': 'Bishnupriya Manipuri', 'qu': 'Quechua (Southern Quechua)',
    'nv': 'Navajo', 'mn': 'Mongolian', 'xmf': 'Mingrelian',
    'ban': 'Balinese', 'si': 'Sinhala', 'tum': 'Tumbuka', 'ps': 'Pashto',
    'frr': 'North Frisian', 'os': 'Ossetian', 'mzn': 'Mazanderani',
    'bat-smg': 'Samogitian', 'or': 'Odia', 'ig': 'Igbo', 'sah': 'Yakut',
    'cdo': 'Eastern Min', 'gd': 'Scottish Gaelic', 'bug': 'Buginese',
    'yi': 'Yiddish', 'sd': 'Sindhi', 'ilo': 'Ilocano', 'am': 'Amharic',
    'nap': 'Neapolitan', 'li': 'Limburgish', 'bcl': 'Central Bikol',
    'fo': 'Faroese', 'gor': 'Gorontalo', 'hsb': 'Upper Sorbian',
    'map-bms': 'Banyumasan', 'mai': 'Maithili', 'shn': 'Shan',
    'eml': 'Emilian-Romagnol', 'ace': 'Acehnese',
    'zh-classical': 'Classical Chinese', 'sa': 'Sanskrit',
    'as': 'Assamese', 'wa': 'Walloon', 'ie': 'Interlingue',
    'hyw': 'Western Armenian', 'lij': 'Ligurian', 'mhr': 'Meadow Mari',
    'zu': 'Zulu', 'sn': 'Shona', 'hif': 'Fiji Hindi', 'mrj': 'Hill Mari',
    'bjn': 'Banjarese', 'mni': 'Meitei', 'km': 'Khmer',
    'hak': 'Hakka Chinese', 'roa-tara': 'Tarantino', 'pam': 'Kapampangan',
    'sat': 'Santali', 'rue': 'Rusyn', 'nso': 'Northern Sotho',
    'bh': 'Bihari (Bhojpuri)', 'so': 'Somali', 'mi': 'Māori',
    'se': 'Northern Sámi', 'myv': 'Erzya', 'vls': 'West Flemish',
    'nds-nl': 'Dutch Low Saxon', 'dag': 'Dagbani', 'sc': 'Sardinian',
    'ary': 'Moroccan Arabic', 'co': 'Corsican', 'kw': 'Cornish',
    'bo': 'Lhasa Tibetan', 'vep': 'Veps', 'glk': 'Gilaki',
    'tk': 'Turkmen', 'kab': 'Kabyle', 'gan': 'Gan Chinese',
    'rw': 'Kinyarwanda', 'fiu-vro': 'Võro', 'ab': 'Abkhaz', 'gv': 'Manx',
    'ug': 'Uyghur', 'nah': 'Nahuatl', 'zea': 'Zeelandic',
    'skr': 'Saraiki', 'frp': 'Franco-Provençal', 'udm': 'Udmurt',
    'pcd': 'Picard', 'mt': 'Maltese', 'kv': 'Komi', 'csb': 'Kashubian',
    'gn': 'Guarani', 'smn': 'Inari Sámi', 'ay': 'Aymara', 'nrm': 'Norman',
    'ks': 'Kashmiri', 'lez': 'Lezgian', 'lfn': 'Lingua Franca Nova',
    'olo': 'Livvi-Karelian', 'mwl': 'Mirandese',
    'stq': 'Saterland Frisian', 'lo': 'Lao', 'ang': 'Old English',
    'mdf': 'Moksha', 'fur': 'Friulian', 'rm': 'Romansh',
    'lad': 'Judaeo-Spanish', 'kaa': 'Karakalpak',
    'gom': 'Konkani (Goan Konkani)', 'ext': 'Extremaduran',
    'koi': 'Permyak', 'tyv': 'Tuvan', 'pap': 'Papiamento', 'av': 'Avar',
    'dsb': 'Lower Sorbian', 'ln': 'Lingala', 'dty': 'Doteli', 'tw': 'Twi',
    'cbk-zam': 'Chavacano (Zamboanga)', 'dv': 'Maldivian',
    'ksh': 'Ripuarian', 'za': 'Zhuang (Standard Zhuang)', 'gag': 'Gagauz',
    'bxr': 'Buryat (Russia Buriat)', 'pfl': 'Palatine German',
    'lg': 'Luganda', 'szy': 'Sakizaya', 'pag': 'Pangasinan',
    'blk': "Pa'O", 'pi': 'Pali', 'tay': 'Atayal', 'haw': 'Hawaiian',
    'awa': 'Awadhi', 'inh': 'Ingush', 'krc': 'Karachay-Balkar',
    'xal': 'Kalmyk Oirat', 'pdc': 'Pennsylvania Dutch', 'to': 'Tongan',
    'atj': 'Atikamekw', 'tcy': 'Tulu', 'arc': 'Aramaic (Syriac)',
    'mnw': 'Mon', 'jam': 'Jamaican Patois', 'shi': 'Shilha',
    'kbp': 'Kabiye', 'wo': 'Wolof', 'anp': 'Angika', 'kbd': 'Kabardian',
    'nia': 'Nias', 'nov': 'Novial', 'om': 'Oromo', 'ki': 'Kikuyu',
    'nqo': "N'Ko", 'bi': 'Bislama', 'xh': 'Xhosa', 'tpi': 'Tok Pisin',
    'tet': 'Tetum', 'ff': 'Fula', 'roa-rup': 'Aromanian', 'jbo': 'Lojban',
    'fj': 'Fijian', 'kg': 'Kongo (Kituba)', 'lbe': 'Lak',
    'ty': 'Tahitian', 'guw': 'Gun', 'cu': 'Old Church Slavonic',
    'trv': 'Seediq', 'ami': 'Amis', 'srn': 'Sranan Tongo', 'sm': 'Samoan',
    'mad': 'Madurese', 'alt': 'Southern Altai', 'ltg': 'Latgalian',
    'gcr': 'French Guianese Creole', 'chr': 'Cherokee', 'tn': 'Tswana',
    'ny': 'Chewa', 'st': 'Sotho', 'pih': 'Norfuk',
    'rmy': 'Romani (Vlax Romani)', 'got': 'Gothic', 'ee': 'Ewe',
    'pcm': 'Nigerian Pidgin', 'bm': 'Bambara', 'ss': 'Swazi',
    'ts': 'Tsonga', 've': 'Venda', 'kcg': 'Tyap', 'chy': 'Cheyenne',
    'rn': 'Kirundi', 'ch': 'Chamorro', 'gur': 'Frafra', 'ik': 'Iñupiaq',
    'ady': 'Adyghe', 'pnt': 'Pontic Greek', 'guc': 'Wayuu',
    'iu': 'Inuktitut', 'pwn': 'Paiwan', 'sg': 'Sango', 'din': 'Dinka',
    'ti': 'Tigrinya', 'kl': 'Greenlandic', 'dz': 'Dzongkha', 'cr': 'Cree',
    'ak': 'Akan',
}

# Base URL of the stats.wikimedia.org table view for each supported metric.
_STATS_URL_TEMPLATES = {
    'pages': 'https://stats.wikimedia.org/#/{wiki}.wikipedia.org/content/pages-to-date/full|table|',
    'edits': 'https://stats.wikimedia.org/#/{wiki}.wikipedia.org/contributing/edits/full|table|',
}

# Labels written to the summary CSV for each metric / sub-metric key.
_METRIC_LABELS = {'pages': 'Pages', 'edits': 'Edits'}
_SUBMETRIC_LABELS = {'content': 'Articles', 'non-content': 'Non-Articles'}


def _extract_wiki_codes(tables):
    """Return ``{wiki code: language}`` from the first suitable table.

    A table is suitable when it has both a ``Wiki`` and a ``Language``
    column. Rows with a missing value in either column, and rows whose
    language is marked ``(closed)``, are left out. Returns ``None`` when
    no table qualifies.
    """
    for table in tables:
        if not {'Language', 'Wiki'}.issubset(set(table.columns)):
            continue
        listing = table[['Wiki', 'Language']].dropna()
        # regex=False: "(closed)" is a literal marker, not a capture group.
        # na=True drops non-text cells, as the original comparison did.
        is_closed = listing['Language'].str.contains(
            '(closed)', regex=False, na=True)
        listing = listing[~is_closed]
        return dict(zip(listing['Wiki'], listing['Language']))
    return None


def fetch_wikis_codes():
    """Return a dict mapping Wikipedia language codes to language names.

    Each URL in ``_WIKI_LIST_URLS`` is tried in order and the first one
    that yields a non-empty mapping wins. If none does, a copy of the
    built-in ``_FALLBACK_WIKI_CODES`` snapshot is returned, so the function
    always returns a dict.
    """
    for url in _WIKI_LIST_URLS:
        try:
            codes = _extract_wiki_codes(pd.read_html(url))
        except Exception:
            # Deliberately broad: network, HTTP, parser and table-layout
            # errors all just mean "try the next source". Unlike a bare
            # ``except`` this still lets Ctrl-C / SystemExit through.
            continue
        if codes:
            return codes
    print(' [!] Could not fetch the live list of Wikipedias; '
          'using the built-in fallback list.')
    return dict(_FALLBACK_WIKI_CODES)


def fetch_wiki_metadata(wiki, metric, submetric, timeout):
    """Download one statistics table as CSV and return its file name.

    Opens the stats.wikimedia.org table view for ``wiki`` in headless
    Firefox, clicks the download button, and renames the downloaded
    ``undefined.csv`` in the current working directory.

    Args:
        wiki: Wikipedia language code used as the ``<wiki>.wikipedia.org``
            host part of the request.
        metric: ``'pages'`` or ``'edits'``.
        submetric: Page-type filter inserted into the request URL (the
            script uses ``'content'`` and ``'non-content'``).
        timeout: Seconds to wait after loading the page before clicking
            the download button.

    Returns:
        ``"<wiki>--<metric>--<submetric>.csv"`` with spaces replaced by
        hyphens; the file is left in the current working directory.

    Raises:
        ValueError: If ``metric`` is not supported.
        Selenium and OS errors propagate unchanged (the caller retries on
        ``ElementClickInterceptedException``).
    """
    # Validate before launching a browser so a bad metric costs nothing.
    if metric not in _STATS_URL_TEMPLATES:
        raise ValueError(f'Error: this metric: {metric} is not supported!')

    base_url = _STATS_URL_TEMPLATES[metric].format(wiki=wiki)
    parameters = f'1-month|editor_type~anonymous*group-bot*name-bot*user+(page_type)~{submetric}|monthly'
    request_url = "".join([base_url, parameters])
    csvFilename = f"{wiki}--{metric}--{submetric}.csv".replace(' ', '-')

    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    # Save downloads straight into the working directory without prompting.
    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.download.folderList", 2)
    profile.set_preference("browser.download.manager.showWhenStarting", False)
    profile.set_preference("browser.download.dir", f"{os.getcwd()}")
    profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
    driver = webdriver.Firefox(options=options, firefox_profile=profile,
                               executable_path='geckodriver',
                               service_log_path=os.devnull)
    try:
        driver.implicitly_wait(3)
        driver.get(request_url)
        # Result unused; kept from the original, which read the page source
        # before waiting (presumably to let the page settle — unverified).
        driver.page_source
        sleep(timeout)
        driver.find_element_by_class_name("ui.icon.button.tooltipped.tooltipped-n").click()
        sleep(3)  # give the browser time to finish writing the download
        os.replace("undefined.csv", csvFilename)
    finally:
        # Always shut the browser down, including on the click failures the
        # caller retries on; otherwise every failure leaks a Firefox process.
        driver.quit()

    print(f' [+] Metadata Exported to `{wiki}/{csvFilename}`.')
    return csvFilename


def _download_latest_row(wiki, metric, submetric, timeout):
    """Fetch one table and return ``(latest_row, csv_name, timeout)``.

    If the download button is covered by another element the fetch is
    retried once with the timeout doubled. The timeout that was actually
    used is returned so the caller can keep the longer wait for later
    requests.
    """
    try:
        csv_name = fetch_wiki_metadata(wiki, metric, submetric, timeout)
    except selenium.common.exceptions.ElementClickInterceptedException:
        timeout *= 2
        csv_name = fetch_wiki_metadata(wiki, metric, submetric, timeout)
    return pd.read_csv(csv_name).iloc[-1], csv_name, timeout


wiki_codes = fetch_wikis_codes()
labels = [f"{value} ({key})" for key, value in wiki_codes.items()]
wikis = list(wiki_codes.keys())
metrics = ['pages', 'edits']
submetrics = ['content', 'non-content']
timeout = 3

for counter, wiki in enumerate(wikis, start=1):
    print(f'{counter}## {wiki_codes[wiki]} Wikipedia Files:')
    os.makedirs(wiki, exist_ok=True)
    os.makedirs('all-metadata', exist_ok=True)

    selected_language = f'{wiki_codes[wiki]} ({wiki})'
    rows = []
    for metric in metrics:
        for submetric in submetrics:
            latest, csv_name, timeout = _download_latest_row(
                wiki, metric, submetric, timeout)
            retrieval_date = pd.to_datetime(latest['timeRange.end']).strftime('%Y-%m-%d')
            totals = {
                'Bots': latest['total.group-bot'] + latest['total.name-bot'],
                'Humans': latest['total.user'] + latest['total.anonymous'],
            }
            for editors, total in totals.items():
                rows.append({
                    'Wiki': selected_language,
                    'Metric': _METRIC_LABELS[metric],
                    'Sub-Metric': _SUBMETRIC_LABELS[submetric],
                    'Editors': editors,
                    'Values': total,
                })
            # Use the name returned by the fetch rather than rebuilding it.
            shutil.move(csv_name, os.path.join(wiki, csv_name))

    wiki_metadata = pd.DataFrame(
        rows, columns=['Wiki', 'Metric', 'Sub-Metric', 'Editors', 'Values'])
    # As before, the date of the last table fetched stamps every row.
    wiki_metadata['Retrieval-Date'] = retrieval_date
    wiki_metadata.to_csv(f'{wiki_codes[wiki].replace(" ","-")}--Wikipedia--Metadata.csv', index=False)

    try:
        shutil.move(wiki, 'all-metadata')
    except OSError as err:
        # e.g. the folder is left over from an earlier run. The old `mv`
        # call failed silently here; stay non-fatal but report it.
        print(f' [!] Could not move `{wiki}` into `all-metadata/`: {err}')
    sleep(1)