Data Exploration
Sanity checks on the parsed STEPBible data.
In [1]:
Copied!
# Notebook setup: make the project's `src` directory importable, then load
# the query helpers used by the cells below.
import sys

import pandas as pd

# Relative path, so it resolves against the kernel's working directory.
SRC_DIR = '../../../src'
# Guard the insert so re-running this cell does not stack duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

from bible_grammar.query import query, reload

# NOTE(review): presumably refreshes the data that `query` reads — confirm
# against bible_grammar.query.
reload()
In [2]:
Copied!
# Overall row counts: total number of rows, then the breakdown per `source`.
# The query runs once; the recorded output shows each line printed a single time.
all_words = query()  # called with no filters
print(f"Total words: {len(all_words):,}")
print(all_words.groupby('source').size())
Total words: 425,454 source TAGNT 141720 TAHOT 283734 dtype: int64
In [3]:
Copied!
# Genesis 1:1 — verify morphology decoding
# Columns to display for this check; the query runs once and the selected
# columns are shown as the cell's output.
hebrew_columns = [
    'word', 'translation', 'part_of_speech',
    'stem', 'conjugation', 'gender', 'number', 'state',
]
query(book='Gen', chapter=1, verse=1)[hebrew_columns]
Out[3]:
| | word | translation | part_of_speech | stem | conjugation | gender | number | state |
|---|---|---|---|---|---|---|---|---|
| 0 | בְּ/רֵאשִׁ֖ית | in/ beginning | Noun | | | Feminine | Singular | Absolute |
| 1 | בָּרָ֣א | he created | Verb | Qal | Perfect | Masculine | Singular | |
| 2 | אֱלֹהִ֑ים | God | Noun | | | Masculine | Plural | Absolute |
| 3 | אֵ֥ת | <obj.> | Particle | | | | | |
| 4 | הַ/שָּׁמַ֖יִם | the/ heavens | Noun | | | Masculine | Plural | Absolute |
| 5 | וְ/אֵ֥ת | and/ <obj.> | Particle | | | | | |
| 6 | הָ/אָֽרֶץ\׃ | the/ earth | Noun | | | Feminine | Singular | Absolute |
In [4]:
Copied!
# Matthew 1:1 — verify Greek decoding
# Columns to display for this check; the query runs once and the selected
# columns are shown as the cell's output.
greek_columns = [
    'word', 'translation', 'part_of_speech',
    'tense', 'voice', 'mood', 'case_', 'number', 'gender',
]
query(book='Mat', chapter=1, verse=1)[greek_columns]
Out[4]:
| | word | translation | part_of_speech | tense | voice | mood | case_ | number | gender |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Βίβλος | [The] book | Noun | | | | Nominative | Singular | Feminine |
| 1 | γενέσεως | of [the] genealogy | Noun | | | | Genitive | Singular | Feminine |
| 2 | Ἰησοῦ | of Jesus | Noun | | | | Genitive | Singular | Masculine |
| 3 | Χριστοῦ | Christ | Noun | | | | Genitive | Singular | Masculine |
| 4 | υἱοῦ | son | Noun | | | | Genitive | Singular | Masculine |
| 5 | Δαυὶδ | of David | Noun | | | | Genitive | Singular | Masculine |
| 6 | υἱοῦ | son | Noun | | | | Genitive | Singular | Masculine |
| 7 | Ἀβραάμ. | of Abraham. | Noun | | | | Genitive | Singular | Masculine |
In [5]:
Copied!
# Part-of-speech distribution (Hebrew)
from bible_grammar.stats import pos_distribution

# NOTE(review): the recorded output below includes single-letter labels
# ('a', 'c', 'o') and one blank label (13 rows) alongside the spelled-out
# names. These look like part-of-speech codes the decoder left unmapped —
# confirm and extend the mapping if so.
pos_distribution('TAHOT')
Out[5]:
| | part_of_speech | count |
|---|---|---|
| 0 | Noun | 116551 |
| 1 | Verb | 60916 |
| 2 | Suffix | 42745 |
| 3 | Particle | 30255 |
| 4 | Preposition | 10613 |
| 5 | Adjective | 8132 |
| 6 | Pronoun | 5735 |
| 7 | Adverb | 4036 |
| 8 | a | 1958 |
| 9 | c | 1489 |
| 10 | Conjunction | 679 |
| 11 | o | 612 |
| 12 | | 13 |
In [6]:
Copied!
# Part-of-speech distribution (Greek)
# Relies on `pos_distribution` imported in the Hebrew distribution cell above.
# NOTE(review): the recorded output below includes single-letter labels
# (D, R, I, X, F, S, K, C, Q) and compound tags such as 'CONJ + G1565=D'.
# These look like raw tags the decoder left unmapped — confirm whether they
# should be decoded or split before counting.
pos_distribution('TAGNT')
Out[6]:
| | part_of_speech | count |
|---|---|---|
| 0 | Noun | 29363 |
| 1 | Verb | 28517 |
| 2 | Article | 20701 |
| 3 | Conjunction | 18371 |
| 4 | Pronoun | 11518 |
| 5 | Preposition | 11461 |
| 6 | Adjective | 8536 |
| 7 | Particle | 4007 |
| 8 | Adverb | 2355 |
| 9 | D | 1756 |
| 10 | R | 1579 |
| 11 | Conditional | 926 |
| 12 | I | 616 |
| 13 | X | 542 |
| 14 | Interjection | 478 |
| 15 | F | 411 |
| 16 | S | 220 |
| 17 | K | 147 |
| 18 | C | 101 |
| 19 | Q | 27 |
| 20 | CONJ + G1565=D | 20 |
| 21 | CONJ + G1437=COND | 17 |
| 22 | CONJ + G1564=ADV | 10 |
| 23 | CONJ + G1563=ADV | 10 |
| 24 | COND + G4007=CONJ | 9 |
| 25 | CONJ + G5101=I | 6 |
| 26 | PREP + G1537=PREP + G4057=ADV | 3 |
| 27 | CONJ + G5104=PRT | 2 |
| 28 | PREP + G3029=ADV | 2 |
| 29 | PREP + G4057=ADV | 2 |
| 30 | CONJ + G5104=PRT + G1065=PRT | 1 |
| 31 | PREP + G4155=V | 1 |
| 32 | PRT + G4225=ADV | 1 |
| 33 | PREP + G4521=N | 1 |
| 34 | PREP + G2955=V | 1 |
| 35 | PREP + G0826=V | 1 |
| 36 | ADV + G3461=N | 1 |