Clean Text for Data Analysis

How to Prepare Text for NLP and Data Analysis (Tutorial)

Below is the script I use in the text preprocessing tutorial. You may also read more about Pandas and NLTK.

# Install the required libraries first if they are not already installed:
#pip install pandas
#pip install nltk

# Import file
import pandas as pd
df = pd.read_csv(r"C:\your-folder\your-file.csv")

df.info()

df.head(10)

# Lowercase characters
df['titleselftext']=df['titleselftext'].str.lower()
df.head(10)

# Remove whitespaces
def remove_whitespaces(titleselftext):
    return  " ".join(titleselftext.split())
df['titleselftext']=df['titleselftext'].apply(remove_whitespaces)
df.head(10)

# Tokenize (creates a list of single entities e.g. words)
import nltk
nltk.download('punkt')
from nltk import word_tokenize
df['titleselftext']=df['titleselftext'].apply(lambda X: word_tokenize(X))
df.head(10)

# Remove NLTK and custom stopwords

# Load NLTK stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.words('english'))
en_stopwords = stopwords.words('english')

# Load custom stopwords
with open(r'C:\your-folder\custom_stopwords.csv', 'r') as f:
    custom_stopwords = f.read().strip().split(",")
custom_stopwords[-10:]

# Remove the stopwords

def remove_stopwords(titleselftext):
    result = []
    for token in titleselftext:
        if token not in en_stopwords and token not in custom_stopwords:
            result.append(token)
            
    return result

df['titleselftext'] = df['titleselftext'].apply(remove_stopwords)
df.head(10)

# Remove punctuation
from nltk.tokenize import RegexpTokenizer

def remove_punct(titleselftext):
    
    tokenizer = RegexpTokenizer(r"\w+")
    lst=tokenizer.tokenize(' '.join(titleselftext))
    return lst

df['titleselftext'] = df['titleselftext'].apply(remove_punct)
df.head(10)

# Lemmatize
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

def lemmatization(titleselftext):
    
    result=[]
    wordnet = WordNetLemmatizer()
    for token,tag in pos_tag(titleselftext):
        pos=tag[0].lower()
        
        if pos not in ['a', 'r', 'n', 'v']:
            pos='n'
            
        result.append(wordnet.lemmatize(token,pos))
    
    return result

df['titleselftext']=df['titleselftext'].apply(lemmatization)
df.head(10)

# Convert list to string
df['titleselftext'] = [' '.join(map(str, l)) for l in df['titleselftext']]
df.head(10)

#Remove words 3 characters in length and less
df['titleselftext'] = df.titleselftext.str.replace(r'\b(\w{1,3})\b', '')

# Remove whitespaces
def remove_whitespaces(titleselftext):
    return  " ".join(titleselftext.split())
df['titleselftext']=df['titleselftext'].apply(remove_whitespaces)
df.head(10)

# Save results to file
df.to_csv(r'C:\your-folder\your-file2.csv', index=False, encoding='utf-8')

If you’re not yet familiar with Python, an introductory getting-started tutorial will help before you work through this script.