#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html
"""This module implements the concept of a Dictionary -- a mapping between words and their integer ids."""
from collections import defaultdict
from collections.abc import Mapping
import logging
import itertools
from typing import Optional, List, Tuple
from gensim import utils
logger = logging.getLogger(__name__)
class Dictionary(utils.SaveLoad, Mapping):
"""Dictionary encapsulates the mapping between normalized words and their integer ids.
Notable instance attributes:
Attributes
----------
token2id : dict of (str, int)
token -> token_id. I.e. the reverse mapping to `self[token_id]`.
cfs : dict of (int, int)
Collection frequencies: token_id -> how many instances of this token are contained in the documents.
dfs : dict of (int, int)
Document frequencies: token_id -> how many documents contain this token.
num_docs : int
Number of documents processed.
num_pos : int
Total number of corpus positions (number of processed words).
num_nnz : int
Total number of non-zeroes in the BOW matrix (sum of the number of unique
words per document over the entire corpus).
"""
def __init__(self, documents=None, prune_at=2000000):
"""
Parameters
----------
documents : iterable of iterable of str, optional
Documents to be used to initialize the mapping and collect corpus statistics.
prune_at : int, optional
            Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM
            footprint; correctness is not guaranteed.
Use :meth:`~gensim.corpora.dictionary.Dictionary.filter_extremes` to perform proper filtering.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import Dictionary
>>>
>>> texts = [['human', 'interface', 'computer']]
>>> dct = Dictionary(texts) # initialize a Dictionary
            >>> dct.add_documents([["cat", "say", "meow"], ["dog"]])  # add more documents (extend the vocabulary)
>>> dct.doc2bow(["dog", "computer", "non_existent_word"])
[(0, 1), (6, 1)]
"""
self.token2id = {}
self.id2token = {}
self.cfs = {}
self.dfs = {}
self.num_docs = 0
self.num_pos = 0
self.num_nnz = 0
if documents is not None:
self.add_documents(documents, prune_at=prune_at)
self.add_lifecycle_event(
"created",
msg=f"built {self} from {self.num_docs} documents (total {self.num_pos} corpus positions)",
)
def __getitem__(self, tokenid):
"""Get the string token that corresponds to `tokenid`.
Parameters
----------
tokenid : int
Id of token.
Returns
-------
str
Token corresponding to `tokenid`.
Raises
------
KeyError
If this Dictionary doesn't contain such `tokenid`.
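
        Examples
        --------
        A minimal sketch (token ids depend on the order in which tokens were added):

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> dct = Dictionary([["a", "b"]])
            >>> dct[0]
            'a'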
"""
if len(self.id2token) != len(self.token2id):
# the word->id mapping has changed (presumably via add_documents);
# recompute id->word accordingly
self.id2token = utils.revdict(self.token2id)
return self.id2token[tokenid] # will throw for non-existent ids
def __iter__(self):
"""Iterate over all tokens."""
return iter(self.keys())
# restore Py2-style dict API
iterkeys = __iter__
def iteritems(self):
return self.items()
def itervalues(self):
return self.values()
def keys(self):
"""Get all stored ids.
Returns
-------
list of int
List of all token ids.
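
        Examples
        --------
        A minimal sketch (ids reflect the order in which tokens were added):

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> dct = Dictionary([["a", "b"]])
            >>> dct.keys()
            [0, 1]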
"""
return list(self.token2id.values())
def __len__(self):
"""Get number of stored tokens.
Returns
-------
int
Number of stored tokens.
"""
return len(self.token2id)
def __str__(self):
some_keys = list(itertools.islice(self.token2id.keys(), 5))
return "%s<%i unique tokens: %s%s>" % (
self.__class__.__name__, len(self), some_keys, '...' if len(self) > 5 else ''
)
@staticmethod
def from_documents(documents):
"""Create :class:`~gensim.corpora.dictionary.Dictionary` from `documents`.
Equivalent to `Dictionary(documents=documents)`.
Parameters
----------
documents : iterable of iterable of str
Input corpus.
Returns
-------
:class:`~gensim.corpora.dictionary.Dictionary`
Dictionary initialized from `documents`.
"""
return Dictionary(documents=documents)
def add_documents(self, documents, prune_at=2000000):
"""Update dictionary from a collection of `documents`.
Parameters
----------
documents : iterable of iterable of str
Input corpus. All tokens should be already **tokenized and normalized**.
prune_at : int, optional
            Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM
            footprint; correctness is not guaranteed.
Use :meth:`~gensim.corpora.dictionary.Dictionary.filter_extremes` to perform proper filtering.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import Dictionary
>>>
>>> corpus = ["máma mele maso".split(), "ema má máma".split()]
>>> dct = Dictionary(corpus)
>>> len(dct)
5
>>> dct.add_documents([["this", "is", "sparta"], ["just", "joking"]])
>>> len(dct)
10
"""
for docno, document in enumerate(documents):
# log progress & run a regular check for pruning, once every 10k docs
if docno % 10000 == 0:
if prune_at is not None and len(self) > prune_at:
self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
logger.info("adding document #%i to %s", docno, self)
# update Dictionary with the document
self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids
logger.info("built %s from %i documents (total %i corpus positions)", self, self.num_docs, self.num_pos)
def doc2bow(self, document, allow_update=False, return_missing=False):
"""Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples.
Parameters
----------
document : list of str
Input document.
allow_update : bool, optional
Update self, by adding new tokens from `document` and updating internal corpus statistics.
return_missing : bool, optional
Return missing tokens (tokens present in `document` but not in self) with frequencies?
        Returns
        -------
list of (int, int)
BoW representation of `document`.
list of (int, int), dict of (str, int)
If `return_missing` is True, return BoW representation of `document` + dictionary with missing
tokens and their frequencies.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import Dictionary
>>> dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
>>> dct.doc2bow(["this", "is", "máma"])
[(2, 1)]
>>> dct.doc2bow(["this", "is", "máma"], return_missing=True)
            ([(2, 1)], {'is': 1, 'this': 1})
"""
if isinstance(document, str):
raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")
# Construct (word, frequency) mapping.
counter = defaultdict(int)
for w in document:
counter[w if isinstance(w, str) else str(w, 'utf-8')] += 1
token2id = self.token2id
if allow_update or return_missing:
missing = sorted(x for x in counter.items() if x[0] not in token2id)
if allow_update:
for w, _ in missing:
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)
result = {token2id[w]: freq for w, freq in counter.items() if w in token2id}
if allow_update:
self.num_docs += 1
self.num_pos += sum(counter.values())
self.num_nnz += len(result)
# keep track of document and collection frequencies
for tokenid, freq in result.items():
self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
# return tokenids, in ascending id order
result = sorted(result.items())
if return_missing:
return result, dict(missing)
else:
return result
def doc2idx(self, document, unknown_word_index=-1):
"""Convert `document` (a list of words) into a list of indexes = list of `token_id`.
        Replace all unknown words, i.e. words not in the dictionary, with the index as set via `unknown_word_index`.
Parameters
----------
document : list of str
            Input document.
unknown_word_index : int, optional
Index to use for words not in the dictionary.
Returns
-------
list of int
Token ids for tokens in `document`, in the same order.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import Dictionary
>>>
>>> corpus = [["a", "a", "b"], ["a", "c"]]
>>> dct = Dictionary(corpus)
>>> dct.doc2idx(["a", "a", "c", "not_in_dictionary", "c"])
[0, 0, 2, -1, 2]
"""
if isinstance(document, str):
raise TypeError("doc2idx expects an array of unicode tokens on input, not a single string")
document = [word if isinstance(word, str) else str(word, 'utf-8') for word in document]
return [self.token2id.get(word, unknown_word_index) for word in document]
def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
"""Filter out tokens in the dictionary by their frequency.
Parameters
----------
no_below : int, optional
Keep tokens which are contained in at least `no_below` documents.
no_above : float, optional
Keep tokens which are contained in no more than `no_above` documents
(fraction of total corpus size, not an absolute number).
keep_n : int, optional
Keep only the first `keep_n` most frequent tokens.
        keep_tokens : iterable of str, optional
Iterable of tokens that **must** stay in dictionary after filtering.
Notes
-----
This removes all tokens in the dictionary that are:
        #. Less frequent than `no_below` documents (absolute number, e.g. `5`), or
#. More frequent than `no_above` documents (fraction of the total corpus size, e.g. `0.3`).
#. After (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `keep_n=None`).
After the pruning, resulting gaps in word ids are shrunk.
Due to this gap shrinking, **the same word may have a different word id before and after the call
to this function!** See :class:`gensim.models.VocabTransform` and the
`dedicated FAQ entry <https://github.com/RaRe-Technologies/gensim/wiki/Recipes-&-FAQ#q8-how-can-i-filter-a-saved-corpus-and-its-corresponding-dictionary>`_ on how # noqa
to transform a corpus built with a dictionary before pruning.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import Dictionary
>>>
>>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
>>> dct = Dictionary(corpus)
>>> len(dct)
5
>>> dct.filter_extremes(no_below=1, no_above=0.5, keep_n=1)
>>> len(dct)
1
"""
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
# determine which tokens to keep
if keep_tokens:
keep_ids = {self.token2id[v] for v in keep_tokens if v in self.token2id}
good_ids = [
v for v in self.token2id.values()
if no_below <= self.dfs.get(v, 0) <= no_above_abs or v in keep_ids
]
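            # rank keep_tokens as if they occurred in every document, so a `keep_n` cut cannot drop them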
good_ids.sort(key=lambda x: self.num_docs if x in keep_ids else self.dfs.get(x, 0), reverse=True)
else:
good_ids = [
v for v in self.token2id.values()
if no_below <= self.dfs.get(v, 0) <= no_above_abs
]
good_ids.sort(key=self.dfs.get, reverse=True)
if keep_n is not None:
good_ids = good_ids[:keep_n]
bad_words = [(self[idx], self.dfs.get(idx, 0)) for idx in set(self).difference(good_ids)]
logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10])
logger.info(
"keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
len(good_ids), no_below, no_above_abs, 100.0 * no_above
)
# do the actual filtering, then rebuild dictionary to remove gaps in ids
self.filter_tokens(good_ids=good_ids)
logger.info("resulting dictionary: %s", self)
def filter_n_most_frequent(self, remove_n):
"""Filter out the 'remove_n' most frequent tokens that appear in the documents.
Parameters
----------
remove_n : int
Number of the most frequent tokens that will be removed.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import Dictionary
>>>
>>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
>>> dct = Dictionary(corpus)
>>> len(dct)
5
>>> dct.filter_n_most_frequent(2)
>>> len(dct)
3
"""
# determine which tokens to keep
        most_frequent_ids = sorted(self.token2id.values(), key=self.dfs.get, reverse=True)[:remove_n]
# do the actual filtering, then rebuild dictionary to remove gaps in ids
most_frequent_words = [(self[idx], self.dfs.get(idx, 0)) for idx in most_frequent_ids]
logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10])
self.filter_tokens(bad_ids=most_frequent_ids)
logger.info("resulting dictionary: %s", self)
def filter_tokens(self, bad_ids=None, good_ids=None):
"""Remove the selected `bad_ids` tokens from :class:`~gensim.corpora.dictionary.Dictionary`.
Alternatively, keep selected `good_ids` in :class:`~gensim.corpora.dictionary.Dictionary` and remove the rest.
Parameters
----------
bad_ids : iterable of int, optional
Collection of word ids to be removed.
good_ids : collection of int, optional
Keep selected collection of word ids and remove the rest.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import Dictionary
>>>
>>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
>>> dct = Dictionary(corpus)
>>> 'ema' in dct.token2id
True
>>> dct.filter_tokens(bad_ids=[dct.token2id['ema']])
>>> 'ema' in dct.token2id
False
>>> len(dct)
4
>>> dct.filter_tokens(good_ids=[dct.token2id['maso']])
>>> len(dct)
1
"""
if bad_ids is not None:
bad_ids = set(bad_ids)
self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid not in bad_ids}
self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid not in bad_ids}
self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid not in bad_ids}
if good_ids is not None:
good_ids = set(good_ids)
self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid in good_ids}
self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid in good_ids}
self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid in good_ids}
self.compactify()
def compactify(self):
"""Assign new word ids to all words, shrinking any gaps."""
logger.debug("rebuilding dictionary, shrinking gaps")
# build mapping from old id -> new id
idmap = dict(zip(sorted(self.token2id.values()), range(len(self.token2id))))
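        # e.g. if the surviving ids are {0, 3, 7}, idmap becomes {0: 0, 3: 1, 7: 2}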
# reassign mappings to new ids
self.token2id = {token: idmap[tokenid] for token, tokenid in self.token2id.items()}
self.id2token = {}
self.dfs = {idmap[tokenid]: freq for tokenid, freq in self.dfs.items()}
self.cfs = {idmap[tokenid]: freq for tokenid, freq in self.cfs.items()}
def save_as_text(self, fname, sort_by_word=True):
"""Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.
Parameters
----------
fname : str
Path to output file.
sort_by_word : bool, optional
Sort words in lexicographical order before writing them out?
Notes
-----
Format::
num_docs
id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
....
id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]
This text format is great for corpus inspection and debugging. As plaintext, it's also easily portable
to other tools and frameworks. For better performance and to store the entire object state,
including collected corpus statistics, use :meth:`~gensim.corpora.dictionary.Dictionary.save` and
:meth:`~gensim.corpora.dictionary.Dictionary.load` instead.
See Also
--------
:meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`
Load :class:`~gensim.corpora.dictionary.Dictionary` from text file.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import Dictionary
>>> from gensim.test.utils import get_tmpfile
>>>
>>> tmp_fname = get_tmpfile("dictionary")
>>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
>>>
>>> dct = Dictionary(corpus)
>>> dct.save_as_text(tmp_fname)
>>>
>>> loaded_dct = Dictionary.load_from_text(tmp_fname)
>>> assert dct.token2id == loaded_dct.token2id
"""
logger.info("saving dictionary mapping to %s", fname)
with utils.open(fname, 'wb') as fout:
numdocs_line = "%d\n" % self.num_docs
fout.write(utils.to_utf8(numdocs_line))
if sort_by_word:
for token, tokenid in sorted(self.token2id.items()):
line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
fout.write(utils.to_utf8(line))
else:
for tokenid, freq in sorted(self.dfs.items(), key=lambda item: -item[1]):
line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
fout.write(utils.to_utf8(line))
def merge_with(self, other):
"""Merge another dictionary into this dictionary, mapping the same tokens to the same ids
and new tokens to new ids.
Notes
-----
The purpose is to merge two corpora created using two different dictionaries: `self` and `other`.
`other` can be any id=>word mapping (a dict, a Dictionary object, ...).
Return a transformation object which, when accessed as `result[doc_from_other_corpus]`, will convert documents
from a corpus built using the `other` dictionary into a document using the new, merged dictionary.
Parameters
----------
other : {dict, :class:`~gensim.corpora.dictionary.Dictionary`}
Other dictionary.
        Returns
        -------
:class:`gensim.models.VocabTransform`
Transformation object.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import Dictionary
>>>
>>> corpus_1, corpus_2 = [["a", "b", "c"]], [["a", "f", "f"]]
>>> dct_1, dct_2 = Dictionary(corpus_1), Dictionary(corpus_2)
>>> dct_1.doc2bow(corpus_2[0])
[(0, 1)]
>>> transformer = dct_1.merge_with(dct_2)
>>> dct_1.doc2bow(corpus_2[0])
[(0, 1), (3, 2)]
"""
old2new = {}
for other_id, other_token in other.items():
if other_token in self.token2id:
new_id = self.token2id[other_token]
else:
new_id = len(self.token2id)
self.token2id[other_token] = new_id
self.dfs[new_id] = 0
old2new[other_id] = new_id
try:
self.dfs[new_id] += other.dfs[other_id]
except Exception:
# `other` isn't a Dictionary (probably just a dict) => ignore dfs, keep going
pass
try:
self.num_docs += other.num_docs
self.num_nnz += other.num_nnz
self.num_pos += other.num_pos
except Exception:
pass
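        # imported here rather than at module level, presumably to avoid a circular import with gensim.models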
import gensim.models
return gensim.models.VocabTransform(old2new)
def patch_with_special_tokens(self, special_token_dict):
"""Patch token2id and id2token using a dictionary of special tokens.
        **Use case:** when doing sequence modeling (e.g. named entity recognition), one may want to specify
special tokens that behave differently than others.
One example is the "unknown" token, and another is the padding token.
It is usual to set the padding token to have index `0`, and patching the dictionary with `{'<PAD>': 0}`
would be one way to specify this.
Parameters
----------
special_token_dict : dict of (str, int)
dict containing the special tokens as keys and their wanted indices as values.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import Dictionary
>>>
>>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
>>> dct = Dictionary(corpus)
>>>
>>> special_tokens = {'pad': 0, 'space': 1}
>>> print(dct.token2id)
{'maso': 0, 'mele': 1, 'máma': 2, 'ema': 3, 'má': 4}
>>>
>>> dct.patch_with_special_tokens(special_tokens)
>>> print(dct.token2id)
            {'maso': 5, 'mele': 6, 'máma': 2, 'ema': 3, 'má': 4, 'pad': 0, 'space': 1}
"""
possible_ids = []
for token, idx in special_token_dict.items():
if token in self.token2id and self.token2id[token] == idx:
continue
if token in self.token2id and self.token2id[token] != idx:
possible_ids.append(self.token2id[token])
del self.token2id[token]
            old_token = self[idx]  # token currently occupying the wanted id (raises KeyError if the id is unused)
            self.token2id[token] = idx
            # reassign the displaced token to an id freed up earlier, or to the next id past the current range
            self.token2id[old_token] = possible_ids.pop() if possible_ids else len(self.token2id) - 1
self.id2token = {} # Make sure that id2token is updated according to special tokens.
@staticmethod
def load_from_text(fname):
"""Load a previously stored :class:`~gensim.corpora.dictionary.Dictionary` from a text file.
Mirror function to :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.
Parameters
----------
fname: str
Path to a file produced by :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.
See Also
--------
:meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`
Save :class:`~gensim.corpora.dictionary.Dictionary` to text file.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import Dictionary
>>> from gensim.test.utils import get_tmpfile
>>>
>>> tmp_fname = get_tmpfile("dictionary")
>>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
>>>
>>> dct = Dictionary(corpus)
>>> dct.save_as_text(tmp_fname)
>>>
>>> loaded_dct = Dictionary.load_from_text(tmp_fname)
>>> assert dct.token2id == loaded_dct.token2id
"""
result = Dictionary()
with utils.open(fname, 'rb') as f:
for lineno, line in enumerate(f):
line = utils.to_unicode(line)
                if lineno == 0:
                    if line.strip().isdigit():
                        result.num_docs = int(line.strip())
                        continue
                    else:
                        # Older versions of save_as_text did not write num_docs on the first line.
                        logger.warning("Text does not contain num_docs on the first line.")
try:
wordid, word, docfreq = line[:-1].split('\t')
except Exception:
raise ValueError("invalid line in dictionary file %s: %s"
% (fname, line.strip()))
wordid = int(wordid)
if word in result.token2id:
raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
result.token2id[word] = wordid
result.dfs[wordid] = int(docfreq)
return result
def most_common(self, n: Optional[int] = None) -> List[Tuple[str, int]]:
"""Return a list of the n most common words and their counts from the most common to the least.
Words with equal counts are ordered in the increasing order of their ids.
Parameters
----------
n : int or None, optional
The number of most common words to be returned. If `None`, all words in the dictionary
will be returned. Default is `None`.
Returns
-------
most_common : list of (str, int)
The n most common words and their counts from the most common to the least.
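
        Examples
        --------
        A minimal sketch (the counts come from `cfs`, collected while building the dictionary):

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>>
            >>> dct = Dictionary([["a", "a", "b"], ["a", "c"]])
            >>> dct.most_common(2)
            [('a', 3), ('b', 1)]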
"""
        most_common = [
            (self[tokenid], count)
            for tokenid, count
            in sorted(self.cfs.items(), key=lambda x: (-x[1], x[0]))[:n]
        ]
return most_common
@staticmethod
def from_corpus(corpus, id2word=None):
"""Create :class:`~gensim.corpora.dictionary.Dictionary` from an existing corpus.
Parameters
----------
corpus : iterable of iterable of (int, number)
Corpus in BoW format.
id2word : dict of (int, object)
Mapping id -> word. If None, the mapping `id2word[word_id] = str(word_id)` will be used.
Notes
-----
This can be useful if you only have a term-document BOW matrix (represented by `corpus`), but not the original
text corpus. This method will scan the term-document count matrix for all word ids that appear in it,
then construct :class:`~gensim.corpora.dictionary.Dictionary` which maps each `word_id -> id2word[word_id]`.
`id2word` is an optional dictionary that maps the `word_id` to a token.
        In case `id2word` isn't specified, the mapping `id2word[word_id] = str(word_id)` will be used.
Returns
-------
:class:`~gensim.corpora.dictionary.Dictionary`
Inferred dictionary from corpus.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import Dictionary
>>>
>>> corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
>>> dct = Dictionary.from_corpus(corpus)
>>> len(dct)
3
"""
result = Dictionary()
max_id = -1
for docno, document in enumerate(corpus):
if docno % 10000 == 0:
logger.info("adding document #%i to %s", docno, result)
result.num_docs += 1
result.num_nnz += len(document)
for wordid, word_freq in document:
max_id = max(wordid, max_id)
result.num_pos += word_freq
result.dfs[wordid] = result.dfs.get(wordid, 0) + 1
if id2word is None:
# make sure length(result) == get_max_id(corpus) + 1
result.token2id = {str(i): i for i in range(max_id + 1)}
else:
# id=>word mapping given: simply copy it
result.token2id = {utils.to_unicode(token): idx for idx, token in id2word.items()}
for idx in result.token2id.values():
# make sure all token ids have a valid `dfs` entry
result.dfs[idx] = result.dfs.get(idx, 0)
logger.info(
"built %s from %i documents (total %i corpus positions)",
result, result.num_docs, result.num_pos
)
return result
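

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only): build a dictionary from a toy
    # corpus, inspect the token -> id mapping, and convert an unseen document
    # to bag-of-words. The corpus and variable names below are made up.
    logging.basicConfig(level=logging.INFO)
    demo_corpus = [["human", "interface", "computer"], ["survey", "user", "computer"]]
    demo_dct = Dictionary(demo_corpus)
    print(demo_dct.token2id)
    print(demo_dct.doc2bow(["computer", "survey", "unknown"]))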